Repository: ZTO-Express/fire Branch: main Commit: f3984f90fc77 Files: 728 Total size: 4.6 MB Directory structure: gitextract_kfsomxpy/ ├── .gitignore ├── LICENSE ├── README.md ├── docs/ │ ├── accumulator.md │ ├── anno.md │ ├── connector/ │ │ ├── adb.md │ │ ├── clickhouse.md │ │ ├── hbase.md │ │ ├── hive.md │ │ ├── jdbc.md │ │ ├── kafka.md │ │ ├── oracle.md │ │ └── rocketmq.md │ ├── datasource.md │ ├── dev/ │ │ ├── config.md │ │ ├── deploy-script.md │ │ ├── engine-env.md │ │ └── integration.md │ ├── feature.md │ ├── highlight/ │ │ ├── checkpoint.md │ │ └── spark-duration.md │ ├── index.md │ ├── platform.md │ ├── pom/ │ │ ├── flink-pom.xml │ │ └── spark-pom.xml │ ├── properties.md │ ├── restful.md │ ├── schedule.md │ └── threadpool.md ├── fire-common/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ ├── java/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── common/ │ │ │ ├── anno/ │ │ │ │ ├── Config.java │ │ │ │ ├── FieldName.java │ │ │ │ ├── FireConf.java │ │ │ │ ├── Internal.java │ │ │ │ ├── Rest.java │ │ │ │ ├── Scheduled.java │ │ │ │ └── TestStep.java │ │ │ ├── bean/ │ │ │ │ ├── FireTask.java │ │ │ │ ├── analysis/ │ │ │ │ │ └── ExceptionMsg.java │ │ │ │ ├── config/ │ │ │ │ │ └── ConfigurationParam.java │ │ │ │ ├── lineage/ │ │ │ │ │ ├── Lineage.java │ │ │ │ │ ├── SQLLineage.java │ │ │ │ │ ├── SQLTable.java │ │ │ │ │ ├── SQLTableColumns.java │ │ │ │ │ ├── SQLTablePartitions.java │ │ │ │ │ └── SQLTableRelations.java │ │ │ │ ├── rest/ │ │ │ │ │ ├── ResultMsg.java │ │ │ │ │ └── yarn/ │ │ │ │ │ └── App.java │ │ │ │ └── runtime/ │ │ │ │ ├── ClassLoaderInfo.java │ │ │ │ ├── CpuInfo.java │ │ │ │ ├── DiskInfo.java │ │ │ │ ├── DisplayInfo.java │ │ │ │ ├── HardwareInfo.java │ │ │ │ ├── JvmInfo.java │ │ │ │ ├── MemoryInfo.java │ │ │ │ ├── NetworkInfo.java │ │ │ │ ├── OSInfo.java │ │ │ │ ├── RuntimeInfo.java │ │ │ │ ├── ThreadInfo.java │ │ │ │ └── UsbInfo.java │ │ │ ├── enu/ │ │ │ │ ├── ConfigureLevel.java │ │ │ │ ├── Datasource.java │ │ │ │ ├── ErrorCode.java │ │ │ │ ├── JdbcDriver.java │ │ │ │ ├── JobType.java │ │ │ │ ├── Operation.java │ │ │ │ ├── RequestMethod.scala │ │ │ │ ├── ThreadPoolType.java │ │ │ │ └── YarnState.java │ │ │ ├── exception/ │ │ │ │ ├── FireException.java │ │ │ │ ├── FireFlinkException.java │ │ │ │ └── FireSparkException.java │ │ │ └── util/ │ │ │ ├── EncryptUtils.java │ │ │ ├── FileUtils.java │ │ │ ├── FindClassUtils.java │ │ │ ├── HttpClientUtils.java │ │ │ ├── IOUtils.java │ │ │ ├── MathUtils.java │ │ │ ├── OSUtils.java │ │ │ ├── ProcessUtil.java │ │ │ ├── ReflectionUtils.java │ │ │ ├── StringsUtils.java │ │ │ ├── UnitFormatUtils.java │ │ │ └── YarnUtils.java │ │ ├── resources/ │ │ │ └── log4j.properties │ │ └── scala/ │ │ └── com/ │ │ └── zto/ │ │ └── fire/ │ │ └── common/ │ │ ├── bean/ │ │ │ └── TableIdentifier.scala │ │ ├── conf/ │ │ │ ├── FireConf.scala │ │ │ ├── FireFrameworkConf.scala │ │ │ ├── FireHDFSConf.scala │ │ │ ├── FireHiveConf.scala │ │ │ ├── FireKafkaConf.scala │ │ │ ├── FirePS1Conf.scala │ │ │ ├── FireRocketMQConf.scala │ │ │ └── KeyNum.scala │ │ ├── ext/ │ │ │ ├── JavaExt.scala │ │ │ └── ScalaExt.scala │ │ ├── package.scala │ │ └── util/ │ │ ├── ConfigurationCenterManager.scala │ │ ├── DateFormatUtils.scala │ │ ├── ExceptionBus.scala │ │ ├── FireFunctions.scala │ │ ├── FireUtils.scala │ │ ├── JSONUtils.scala │ │ ├── JavaTypeMap.scala │ │ ├── KafkaUtils.scala │ │ ├── LineageManager.scala │ │ ├── LogUtils.scala │ │ ├── Logging.scala │ │ ├── MQProducer.scala │ │ ├── NumberFormatUtils.scala │ │ ├── PropUtils.scala │ │ ├── RegularUtils.scala │ │ ├── 
SQLLineageManager.scala │ │ ├── SQLUtils.scala │ │ ├── ScalaUtils.scala │ │ ├── ShutdownHookManager.scala │ │ ├── ThreadUtils.scala │ │ ├── Tools.scala │ │ └── ValueUtils.scala │ └── test/ │ └── scala/ │ └── com/ │ └── zto/ │ └── fire/ │ └── common/ │ └── util/ │ ├── RegularUtilsUnitTest.scala │ ├── SQLUtilsTest.scala │ ├── ShutdownHookManagerTest.scala │ └── ValueUtilsTest.scala ├── fire-connectors/ │ ├── .gitignore │ ├── base-connectors/ │ │ ├── fire-hbase/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── main/ │ │ │ ├── java/ │ │ │ │ └── com/ │ │ │ │ └── zto/ │ │ │ │ └── fire/ │ │ │ │ └── hbase/ │ │ │ │ └── anno/ │ │ │ │ └── HConfig.java │ │ │ └── scala/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── hbase/ │ │ │ ├── HBaseConnector.scala │ │ │ ├── HBaseFunctions.scala │ │ │ ├── bean/ │ │ │ │ ├── HBaseBaseBean.java │ │ │ │ └── MultiVersionsBean.java │ │ │ ├── conf/ │ │ │ │ └── FireHBaseConf.scala │ │ │ └── utils/ │ │ │ └── HBaseUtils.scala │ │ ├── fire-jdbc/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── main/ │ │ │ ├── resources/ │ │ │ │ └── driver.properties │ │ │ └── scala/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── jdbc/ │ │ │ ├── JdbcConnector.scala │ │ │ ├── JdbcConnectorBridge.scala │ │ │ ├── JdbcFunctions.scala │ │ │ ├── conf/ │ │ │ │ └── FireJdbcConf.scala │ │ │ └── util/ │ │ │ └── DBUtils.scala │ │ └── pom.xml │ ├── flink-connectors/ │ │ ├── flink-clickhouse/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── main/ │ │ │ ├── java-flink-1.14/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── flink/ │ │ │ │ └── connector/ │ │ │ │ └── clickhouse/ │ │ │ │ ├── ClickHouseDynamicTableFactory.java │ │ │ │ ├── ClickHouseDynamicTableSink.java │ │ │ │ ├── ClickHouseDynamicTableSource.java │ │ │ │ ├── catalog/ │ │ │ │ │ ├── ClickHouseCatalog.java │ │ │ │ │ └── ClickHouseCatalogFactory.java │ │ │ │ ├── config/ │ │ │ │ │ ├── ClickHouseConfig.java │ │ │ │ │ └── ClickHouseConfigOptions.java │ │ │ │ ├── internal/ │ │ │ │ │ ├── AbstractClickHouseInputFormat.java │ │ │ │ │ ├── AbstractClickHouseOutputFormat.java │ │ │ │ │ ├── ClickHouseBatchInputFormat.java │ │ │ │ │ ├── ClickHouseBatchOutputFormat.java │ │ │ │ │ ├── ClickHouseShardInputFormat.java │ │ │ │ │ ├── ClickHouseShardOutputFormat.java │ │ │ │ │ ├── ClickHouseStatementFactory.java │ │ │ │ │ ├── common/ │ │ │ │ │ │ └── DistributedEngineFullSchema.java │ │ │ │ │ ├── connection/ │ │ │ │ │ │ └── ClickHouseConnectionProvider.java │ │ │ │ │ ├── converter/ │ │ │ │ │ │ ├── ClickHouseConverterUtils.java │ │ │ │ │ │ └── ClickHouseRowConverter.java │ │ │ │ │ ├── executor/ │ │ │ │ │ │ ├── ClickHouseBatchExecutor.java │ │ │ │ │ │ ├── ClickHouseExecutor.java │ │ │ │ │ │ └── ClickHouseUpsertExecutor.java │ │ │ │ │ ├── options/ │ │ │ │ │ │ ├── ClickHouseConnectionOptions.java │ │ │ │ │ │ ├── ClickHouseDmlOptions.java │ │ │ │ │ │ └── ClickHouseReadOptions.java │ │ │ │ │ └── partitioner/ │ │ │ │ │ ├── BalancedPartitioner.java │ │ │ │ │ ├── ClickHousePartitioner.java │ │ │ │ │ ├── HashPartitioner.java │ │ │ │ │ └── ShufflePartitioner.java │ │ │ │ ├── split/ │ │ │ │ │ ├── ClickHouseBatchBetweenParametersProvider.java │ │ │ │ │ ├── ClickHouseBetweenParametersProvider.java │ │ │ │ │ ├── ClickHouseParametersProvider.java │ │ │ │ │ ├── ClickHouseShardBetweenParametersProvider.java │ │ │ │ │ └── ClickHouseShardTableParametersProvider.java │ │ │ │ └── util/ │ │ │ │ ├── ClickHouseTypeUtil.java │ │ │ │ ├── ClickHouseUtil.java │ │ │ │ ├── FilterPushDownHelper.java │ │ │ │ └── SqlClause.java │ │ │ └── resources/ │ │ │ └── META-INF/ │ │ │ └── services/ │ │ │ └── 
org.apache.flink.table.factories.Factory │ │ ├── flink-es/ │ │ │ └── pom.xml │ │ ├── flink-rocketmq/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── main/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── rocketmq/ │ │ │ │ └── flink/ │ │ │ │ ├── RocketMQConfig.java │ │ │ │ ├── RocketMQSink.java │ │ │ │ ├── RocketMQSinkWithTag.java │ │ │ │ ├── RocketMQSource.java │ │ │ │ ├── RocketMQUtils.java │ │ │ │ ├── RunningChecker.java │ │ │ │ └── common/ │ │ │ │ ├── selector/ │ │ │ │ │ ├── DefaultTopicSelector.java │ │ │ │ │ ├── SimpleTopicSelector.java │ │ │ │ │ └── TopicSelector.java │ │ │ │ └── serialization/ │ │ │ │ ├── JsonSerializationSchema.java │ │ │ │ ├── KeyValueDeserializationSchema.java │ │ │ │ ├── KeyValueSerializationSchema.java │ │ │ │ ├── SimpleKeyValueDeserializationSchema.java │ │ │ │ ├── SimpleKeyValueSerializationSchema.java │ │ │ │ └── TagKeyValueSerializationSchema.java │ │ │ ├── java-flink-1.12/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── rocketmq/ │ │ │ │ └── flink/ │ │ │ │ ├── RocketMQSourceWithTag.java │ │ │ │ └── common/ │ │ │ │ └── serialization/ │ │ │ │ ├── JsonDeserializationSchema.java │ │ │ │ ├── SimpleTagKeyValueDeserializationSchema.java │ │ │ │ └── TagKeyValueDeserializationSchema.java │ │ │ ├── java-flink-1.13/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── rocketmq/ │ │ │ │ └── flink/ │ │ │ │ ├── RocketMQSourceWithTag.java │ │ │ │ └── common/ │ │ │ │ └── serialization/ │ │ │ │ ├── JsonDeserializationSchema.java │ │ │ │ ├── SimpleTagKeyValueDeserializationSchema.java │ │ │ │ └── TagKeyValueDeserializationSchema.java │ │ │ ├── java-flink-1.14/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── rocketmq/ │ │ │ │ └── flink/ │ │ │ │ ├── RocketMQSourceWithTag.java │ │ │ │ └── common/ │ │ │ │ └── serialization/ │ │ │ │ ├── JsonDeserializationSchema.java │ │ │ │ ├── SimpleTagKeyValueDeserializationSchema.java │ │ │ │ └── TagKeyValueDeserializationSchema.java │ │ │ ├── resources/ │ │ │ │ └── META-INF/ │ │ │ │ └── services/ │ │ │ │ └── org.apache.flink.table.factories.Factory │ │ │ └── scala/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── flink/ │ │ │ └── sql/ │ │ │ └── connector/ │ │ │ └── rocketmq/ │ │ │ ├── RocketMQDynamicTableFactory.scala │ │ │ ├── RocketMQDynamicTableSink.scala │ │ │ ├── RocketMQDynamicTableSource.scala │ │ │ └── RocketMQOptions.scala │ │ └── pom.xml │ ├── pom.xml │ └── spark-connectors/ │ ├── pom.xml │ ├── spark-hbase/ │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── hadoop/ │ │ │ └── hbase/ │ │ │ ├── client/ │ │ │ │ ├── ConnFactoryExtend.java │ │ │ │ └── ConnectionFactory.java │ │ │ └── spark/ │ │ │ ├── SparkSQLPushDownFilter.java │ │ │ ├── example/ │ │ │ │ └── hbasecontext/ │ │ │ │ ├── JavaHBaseBulkDeleteExample.java │ │ │ │ ├── JavaHBaseBulkGetExample.java │ │ │ │ ├── JavaHBaseBulkPutExample.java │ │ │ │ ├── JavaHBaseDistributedScan.java │ │ │ │ ├── JavaHBaseMapGetPutExample.java │ │ │ │ └── JavaHBaseStreamingBulkPutExample.java │ │ │ └── protobuf/ │ │ │ └── generated/ │ │ │ └── FilterProtos.java │ │ ├── protobuf/ │ │ │ └── Filter.proto │ │ ├── scala/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── hadoop/ │ │ │ └── hbase/ │ │ │ └── spark/ │ │ │ ├── BulkLoadPartitioner.scala │ │ │ ├── ByteArrayComparable.scala │ │ │ ├── ByteArrayWrapper.scala │ │ │ ├── ColumnFamilyQualifierMapKeyWrapper.scala │ │ │ ├── DefaultSource.scala │ │ │ ├── DynamicLogicExpression.scala │ │ │ ├── FamiliesQualifiersValues.scala │ │ │ ├── FamilyHFileWriteOptions.scala │ │ │ ├── HBaseContext.scala │ │ │ ├── 
HBaseDStreamFunctions.scala │ │ │ ├── HBaseRDDFunctions.scala │ │ │ ├── JavaHBaseContext.scala │ │ │ ├── KeyFamilyQualifier.scala │ │ │ ├── NewHBaseRDD.scala │ │ │ ├── datasources/ │ │ │ │ ├── Bound.scala │ │ │ │ ├── HBaseResources.scala │ │ │ │ ├── HBaseSparkConf.scala │ │ │ │ ├── SerializableConfiguration.scala │ │ │ │ └── package.scala │ │ │ └── example/ │ │ │ ├── hbasecontext/ │ │ │ │ ├── HBaseBulkDeleteExample.scala │ │ │ │ ├── HBaseBulkGetExample.scala │ │ │ │ ├── HBaseBulkPutExample.scala │ │ │ │ ├── HBaseBulkPutExampleFromFile.scala │ │ │ │ ├── HBaseBulkPutTimestampExample.scala │ │ │ │ ├── HBaseDistributedScanExample.scala │ │ │ │ └── HBaseStreamingBulkPutExample.scala │ │ │ └── rdd/ │ │ │ ├── HBaseBulkDeleteExample.scala │ │ │ ├── HBaseBulkGetExample.scala │ │ │ ├── HBaseBulkPutExample.scala │ │ │ ├── HBaseForeachPartitionExample.scala │ │ │ └── HBaseMapPartitionExample.scala │ │ ├── scala-spark-2.3/ │ │ │ └── apache/ │ │ │ └── hadoop/ │ │ │ └── hbase/ │ │ │ └── spark/ │ │ │ └── datasources/ │ │ │ └── HBaseTableScanRDD.scala │ │ ├── scala-spark-2.4/ │ │ │ └── apache/ │ │ │ └── hadoop/ │ │ │ └── hbase/ │ │ │ └── spark/ │ │ │ └── datasources/ │ │ │ └── HBaseTableScanRDD.scala │ │ ├── scala-spark-3.0/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ ├── hadoop/ │ │ │ │ └── hbase/ │ │ │ │ └── spark/ │ │ │ │ └── datasources/ │ │ │ │ └── HBaseTableScanRDD.scala │ │ │ └── spark/ │ │ │ └── deploy/ │ │ │ └── SparkHadoopUtil.scala │ │ ├── scala-spark-3.1/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ ├── hadoop/ │ │ │ │ └── hbase/ │ │ │ │ └── spark/ │ │ │ │ └── datasources/ │ │ │ │ └── HBaseTableScanRDD.scala │ │ │ └── spark/ │ │ │ └── deploy/ │ │ │ └── SparkHadoopUtil.scala │ │ ├── scala-spark-3.2/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ ├── hadoop/ │ │ │ │ └── hbase/ │ │ │ │ └── spark/ │ │ │ │ └── datasources/ │ │ │ │ └── HBaseTableScanRDD.scala │ │ │ └── spark/ │ │ │ └── deploy/ │ │ │ └── SparkHadoopUtil.scala │ │ └── scala-spark-3.3/ │ │ └── org/ │ │ └── apache/ │ │ ├── hadoop/ │ │ │ └── hbase/ │ │ │ └── spark/ │ │ │ └── datasources/ │ │ │ └── HBaseTableScanRDD.scala │ │ └── spark/ │ │ └── deploy/ │ │ └── SparkHadoopUtil.scala │ └── spark-rocketmq/ │ ├── pom.xml │ └── src/ │ └── main/ │ ├── java/ │ │ └── org/ │ │ └── apache/ │ │ └── rocketmq/ │ │ └── spark/ │ │ ├── OffsetCommitCallback.java │ │ ├── RocketMQConfig.java │ │ ├── TopicQueueId.java │ │ └── streaming/ │ │ ├── DefaultMessageRetryManager.java │ │ ├── MessageRetryManager.java │ │ ├── MessageSet.java │ │ ├── ReliableRocketMQReceiver.java │ │ └── RocketMQReceiver.java │ ├── scala/ │ │ └── org/ │ │ └── apache/ │ │ ├── rocketmq/ │ │ │ └── spark/ │ │ │ ├── CachedMQConsumer.scala │ │ │ ├── ConsumerStrategy.scala │ │ │ ├── LocationStrategy.scala │ │ │ ├── Logging.scala │ │ │ ├── OffsetRange.scala │ │ │ ├── RocketMqRDDPartition.scala │ │ │ └── RocketMqUtils.scala │ │ └── spark/ │ │ ├── sql/ │ │ │ └── rocketmq/ │ │ │ ├── CachedRocketMQConsumer.scala │ │ │ ├── CachedRocketMQProducer.scala │ │ │ ├── JsonUtils.scala │ │ │ ├── RocketMQConf.scala │ │ │ ├── RocketMQOffsetRangeLimit.scala │ │ │ ├── RocketMQOffsetReader.scala │ │ │ ├── RocketMQRelation.scala │ │ │ ├── RocketMQSink.scala │ │ │ ├── RocketMQSourceProvider.scala │ │ │ ├── RocketMQUtils.scala │ │ │ ├── RocketMQWriteTask.scala │ │ │ └── RocketMQWriter.scala │ │ └── streaming/ │ │ └── MQPullInputDStream.scala │ ├── scala-spark-2.3/ │ │ ├── org/ │ │ │ └── apache/ │ │ │ └── spark/ │ │ │ └── sql/ │ │ │ └── rocketmq/ │ │ │ ├── RocketMQSource.scala │ │ │ ├── RocketMQSourceOffset.scala │ │ │ └── 
RocketMQSourceRDDOffsetRange.scala │ │ └── org.apache.spark.streaming/ │ │ └── RocketMqRDD.scala │ ├── scala-spark-2.4/ │ │ ├── org/ │ │ │ └── apache/ │ │ │ └── spark/ │ │ │ └── sql/ │ │ │ └── rocketmq/ │ │ │ ├── RocketMQSource.scala │ │ │ ├── RocketMQSourceOffset.scala │ │ │ └── RocketMQSourceRDDOffsetRange.scala │ │ └── org.apache.spark.streaming/ │ │ └── RocketMqRDD.scala │ ├── scala-spark-3.0/ │ │ └── org/ │ │ └── apache/ │ │ └── spark/ │ │ ├── sql/ │ │ │ └── rocketmq/ │ │ │ ├── RocketMQSource.scala │ │ │ ├── RocketMQSourceOffset.scala │ │ │ └── RocketMQSourceRDD.scala │ │ └── streaming/ │ │ └── RocketMqRDD.scala │ ├── scala-spark-3.1/ │ │ └── org/ │ │ └── apache/ │ │ └── spark/ │ │ ├── sql/ │ │ │ └── rocketmq/ │ │ │ ├── RocketMQSource.scala │ │ │ ├── RocketMQSourceOffset.scala │ │ │ └── RocketMQSourceRDD.scala │ │ └── streaming/ │ │ └── RocketMqRDD.scala │ ├── scala-spark-3.2/ │ │ └── org/ │ │ └── apache/ │ │ └── spark/ │ │ ├── sql/ │ │ │ └── rocketmq/ │ │ │ ├── RocketMQSource.scala │ │ │ ├── RocketMQSourceOffset.scala │ │ │ └── RocketMQSourceRDD.scala │ │ └── streaming/ │ │ └── RocketMqRDD.scala │ └── scala-spark-3.3/ │ └── org/ │ └── apache/ │ └── spark/ │ ├── sql/ │ │ └── rocketmq/ │ │ ├── RocketMQSource.scala │ │ ├── RocketMQSourceOffset.scala │ │ └── RocketMQSourceRDD.scala │ └── streaming/ │ └── RocketMqRDD.scala ├── fire-core/ │ ├── pom.xml │ └── src/ │ └── main/ │ ├── java/ │ │ └── com/ │ │ └── zto/ │ │ └── fire/ │ │ └── core/ │ │ ├── TimeCost.java │ │ ├── anno/ │ │ │ ├── connector/ │ │ │ │ ├── HBase.java │ │ │ │ ├── HBase2.java │ │ │ │ ├── HBase3.java │ │ │ │ ├── HBase4.java │ │ │ │ ├── HBase5.java │ │ │ │ ├── Hive.java │ │ │ │ ├── Jdbc.java │ │ │ │ ├── Jdbc2.java │ │ │ │ ├── Jdbc3.java │ │ │ │ ├── Jdbc4.java │ │ │ │ ├── Jdbc5.java │ │ │ │ ├── Kafka.java │ │ │ │ ├── Kafka2.java │ │ │ │ ├── Kafka3.java │ │ │ │ ├── Kafka4.java │ │ │ │ ├── Kafka5.java │ │ │ │ ├── RocketMQ.java │ │ │ │ ├── RocketMQ2.java │ │ │ │ ├── RocketMQ3.java │ │ │ │ ├── RocketMQ4.java │ │ │ │ └── RocketMQ5.java │ │ │ └── lifecycle/ │ │ │ ├── After.java │ │ │ ├── Before.java │ │ │ ├── Handle.java │ │ │ ├── Process.java │ │ │ ├── Step1.java │ │ │ ├── Step10.java │ │ │ ├── Step11.java │ │ │ ├── Step12.java │ │ │ ├── Step13.java │ │ │ ├── Step14.java │ │ │ ├── Step15.java │ │ │ ├── Step16.java │ │ │ ├── Step17.java │ │ │ ├── Step18.java │ │ │ ├── Step19.java │ │ │ ├── Step2.java │ │ │ ├── Step3.java │ │ │ ├── Step4.java │ │ │ ├── Step5.java │ │ │ ├── Step6.java │ │ │ ├── Step7.java │ │ │ ├── Step8.java │ │ │ └── Step9.java │ │ ├── bean/ │ │ │ └── ArthasParam.java │ │ └── task/ │ │ ├── SchedulerManager.java │ │ ├── TaskRunner.java │ │ └── TaskRunnerQueue.java │ ├── resources/ │ │ ├── cluster.properties │ │ └── fire.properties │ └── scala/ │ └── com/ │ └── zto/ │ └── fire/ │ └── core/ │ ├── Api.scala │ ├── BaseFire.scala │ ├── conf/ │ │ └── AnnoManager.scala │ ├── connector/ │ │ └── Connector.scala │ ├── ext/ │ │ ├── BaseFireExt.scala │ │ └── Provider.scala │ ├── plugin/ │ │ ├── ArthasDynamicLauncher.scala │ │ ├── ArthasLauncher.scala │ │ └── ArthasManager.scala │ ├── rest/ │ │ ├── RestCase.scala │ │ ├── RestServerManager.scala │ │ └── SystemRestful.scala │ ├── sql/ │ │ ├── SqlExtensionsParser.scala │ │ └── SqlParser.scala │ ├── sync/ │ │ ├── LineageAccumulatorManager.scala │ │ ├── SyncEngineConf.scala │ │ └── SyncManager.scala │ ├── task/ │ │ └── FireInternalTask.scala │ └── util/ │ └── SingletonFactory.scala ├── fire-engines/ │ ├── .gitignore │ ├── fire-flink/ │ │ ├── .gitignore │ │ ├── pom.xml │ │ └── src/ │ │ 
└── main/ │ │ ├── java/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── flink/ │ │ │ ├── anno/ │ │ │ │ ├── Checkpoint.java │ │ │ │ ├── FlinkConf.java │ │ │ │ └── Streaming.java │ │ │ ├── bean/ │ │ │ │ ├── CheckpointParams.java │ │ │ │ ├── DistributeBean.java │ │ │ │ └── FlinkTableSchema.java │ │ │ ├── enu/ │ │ │ │ └── DistributeModule.java │ │ │ ├── ext/ │ │ │ │ └── watermark/ │ │ │ │ └── FirePeriodicWatermarks.java │ │ │ ├── sink/ │ │ │ │ ├── BaseSink.scala │ │ │ │ ├── HBaseSink.scala │ │ │ │ └── JdbcSink.scala │ │ │ └── task/ │ │ │ └── FlinkSchedulerManager.java │ │ ├── resources/ │ │ │ ├── META-INF/ │ │ │ │ └── services/ │ │ │ │ └── org.apache.flink.table.factories.Factory │ │ │ ├── flink-batch.properties │ │ │ ├── flink-streaming.properties │ │ │ └── flink.properties │ │ ├── scala/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ ├── fire/ │ │ │ │ └── flink/ │ │ │ │ ├── BaseFlink.scala │ │ │ │ ├── BaseFlinkBatch.scala │ │ │ │ ├── BaseFlinkCore.scala │ │ │ │ ├── BaseFlinkStreaming.scala │ │ │ │ ├── FlinkBatch.scala │ │ │ │ ├── FlinkCore.scala │ │ │ │ ├── FlinkStreaming.scala │ │ │ │ ├── acc/ │ │ │ │ │ └── MultiCounterAccumulator.scala │ │ │ │ ├── conf/ │ │ │ │ │ ├── FireFlinkConf.scala │ │ │ │ │ └── FlinkAnnoManager.scala │ │ │ │ ├── ext/ │ │ │ │ │ ├── batch/ │ │ │ │ │ │ ├── BatchExecutionEnvExt.scala │ │ │ │ │ │ ├── BatchTableEnvExt.scala │ │ │ │ │ │ └── DataSetExt.scala │ │ │ │ │ ├── function/ │ │ │ │ │ │ ├── RichFunctionExt.scala │ │ │ │ │ │ └── RuntimeContextExt.scala │ │ │ │ │ ├── provider/ │ │ │ │ │ │ ├── HBaseConnectorProvider.scala │ │ │ │ │ │ └── JdbcFlinkProvider.scala │ │ │ │ │ └── stream/ │ │ │ │ │ ├── DataStreamExt.scala │ │ │ │ │ ├── KeyedStreamExt.scala │ │ │ │ │ ├── RowExt.scala │ │ │ │ │ ├── SQLExt.scala │ │ │ │ │ ├── StreamExecutionEnvExt.scala │ │ │ │ │ ├── TableEnvExt.scala │ │ │ │ │ ├── TableExt.scala │ │ │ │ │ └── TableResultImplExt.scala │ │ │ │ ├── plugin/ │ │ │ │ │ └── FlinkArthasLauncher.scala │ │ │ │ ├── rest/ │ │ │ │ │ └── FlinkSystemRestful.scala │ │ │ │ ├── sql/ │ │ │ │ │ ├── FlinkSqlExtensionsParser.scala │ │ │ │ │ └── FlinkSqlParserBase.scala │ │ │ │ ├── sync/ │ │ │ │ │ ├── DistributeSyncManager.scala │ │ │ │ │ ├── FlinkLineageAccumulatorManager.scala │ │ │ │ │ └── SyncFlinkEngine.scala │ │ │ │ ├── task/ │ │ │ │ │ └── FlinkInternalTask.scala │ │ │ │ └── util/ │ │ │ │ ├── FlinkSingletonFactory.scala │ │ │ │ ├── FlinkUtils.scala │ │ │ │ ├── HivePartitionTimeExtractor.scala │ │ │ │ ├── RocketMQUtils.scala │ │ │ │ └── StateCleanerUtils.scala │ │ │ └── fire.scala │ │ ├── scala-flink-1.12/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── flink/ │ │ │ └── sql/ │ │ │ └── FlinkSqlParser.scala │ │ ├── scala-flink-1.13/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── flink/ │ │ │ └── sql/ │ │ │ └── FlinkSqlParser.scala │ │ └── scala-flink-1.14/ │ │ └── com/ │ │ └── zto/ │ │ └── fire/ │ │ └── flink/ │ │ └── sql/ │ │ └── FlinkSqlParser.scala │ ├── fire-spark/ │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ ├── java/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── spark/ │ │ │ ├── anno/ │ │ │ │ ├── SparkConf.java │ │ │ │ ├── Streaming.java │ │ │ │ └── StreamingDuration.java │ │ │ ├── bean/ │ │ │ │ ├── ColumnMeta.java │ │ │ │ ├── FunctionMeta.java │ │ │ │ ├── GenerateBean.java │ │ │ │ ├── RestartParams.java │ │ │ │ ├── SparkInfo.java │ │ │ │ └── TableMeta.java │ │ │ └── task/ │ │ │ └── SparkSchedulerManager.java │ │ ├── resources/ │ │ │ ├── spark-core.properties │ │ │ ├── spark-streaming.properties │ │ │ ├── spark.properties │ │ │ └── 
structured-streaming.properties │ │ ├── scala/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ ├── fire/ │ │ │ │ └── spark/ │ │ │ │ ├── BaseSpark.scala │ │ │ │ ├── BaseSparkBatch.scala │ │ │ │ ├── BaseSparkCore.scala │ │ │ │ ├── BaseSparkStreaming.scala │ │ │ │ ├── BaseStructuredStreaming.scala │ │ │ │ ├── SparkBatch.scala │ │ │ │ ├── SparkCore.scala │ │ │ │ ├── SparkStreaming.scala │ │ │ │ ├── StructuredStreaming.scala │ │ │ │ ├── acc/ │ │ │ │ │ ├── AccumulatorManager.scala │ │ │ │ │ ├── EnvironmentAccumulator.scala │ │ │ │ │ ├── LineageAccumulator.scala │ │ │ │ │ ├── LogAccumulator.scala │ │ │ │ │ ├── MultiCounterAccumulator.scala │ │ │ │ │ ├── MultiTimerAccumulator.scala │ │ │ │ │ ├── StringAccumulator.scala │ │ │ │ │ └── SyncAccumulator.scala │ │ │ │ ├── conf/ │ │ │ │ │ ├── FireSparkConf.scala │ │ │ │ │ └── SparkAnnoManager.scala │ │ │ │ ├── connector/ │ │ │ │ │ ├── BeanGenReceiver.scala │ │ │ │ │ ├── DataGenReceiver.scala │ │ │ │ │ ├── HBaseBulkConnector.scala │ │ │ │ │ ├── HBaseBulkFunctions.scala │ │ │ │ │ └── HBaseSparkBridge.scala │ │ │ │ ├── ext/ │ │ │ │ │ ├── core/ │ │ │ │ │ │ ├── DStreamExt.scala │ │ │ │ │ │ ├── DataFrameExt.scala │ │ │ │ │ │ ├── DatasetExt.scala │ │ │ │ │ │ ├── RDDExt.scala │ │ │ │ │ │ ├── SQLContextExt.scala │ │ │ │ │ │ ├── SparkConfExt.scala │ │ │ │ │ │ ├── SparkContextExt.scala │ │ │ │ │ │ ├── SparkSessionExt.scala │ │ │ │ │ │ └── StreamingContextExt.scala │ │ │ │ │ └── provider/ │ │ │ │ │ ├── HBaseBulkProvider.scala │ │ │ │ │ ├── HBaseConnectorProvider.scala │ │ │ │ │ ├── HBaseHadoopProvider.scala │ │ │ │ │ ├── JdbcSparkProvider.scala │ │ │ │ │ ├── KafkaSparkProvider.scala │ │ │ │ │ ├── SparkProvider.scala │ │ │ │ │ └── SqlProvider.scala │ │ │ │ ├── listener/ │ │ │ │ │ ├── FireSparkListener.scala │ │ │ │ │ └── FireStreamingQueryListener.scala │ │ │ │ ├── plugin/ │ │ │ │ │ └── SparkArthasLauncher.scala │ │ │ │ ├── rest/ │ │ │ │ │ └── SparkSystemRestful.scala │ │ │ │ ├── sink/ │ │ │ │ │ ├── FireSink.scala │ │ │ │ │ └── JdbcStreamSink.scala │ │ │ │ ├── sql/ │ │ │ │ │ ├── SparkSqlExtensionsParserBase.scala │ │ │ │ │ ├── SparkSqlParserBase.scala │ │ │ │ │ └── SqlExtensions.scala │ │ │ │ ├── sync/ │ │ │ │ │ ├── DistributeSyncManager.scala │ │ │ │ │ ├── SparkLineageAccumulatorManager.scala │ │ │ │ │ └── SyncSparkEngine.scala │ │ │ │ ├── task/ │ │ │ │ │ └── SparkInternalTask.scala │ │ │ │ ├── udf/ │ │ │ │ │ └── UDFs.scala │ │ │ │ └── util/ │ │ │ │ ├── RocketMQUtils.scala │ │ │ │ ├── SparkSingletonFactory.scala │ │ │ │ └── SparkUtils.scala │ │ │ └── fire.scala │ │ ├── scala-spark-2.3/ │ │ │ └── com.zto.fire.spark.sql/ │ │ │ ├── SparkSqlExtensionsParser.scala │ │ │ └── SparkSqlParser.scala │ │ ├── scala-spark-2.4/ │ │ │ └── com.zto.fire.spark.sql/ │ │ │ ├── SparkSqlExtensionsParser.scala │ │ │ └── SparkSqlParser.scala │ │ ├── scala-spark-3.0/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── spark/ │ │ │ └── sql/ │ │ │ ├── SparkSqlExtensionsParser.scala │ │ │ └── SparkSqlParser.scala │ │ ├── scala-spark-3.1/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── spark/ │ │ │ └── sql/ │ │ │ ├── SparkSqlExtensionsParser.scala │ │ │ └── SparkSqlParser.scala │ │ ├── scala-spark-3.2/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── spark/ │ │ │ └── sql/ │ │ │ ├── SparkSqlExtensionsParser.scala │ │ │ └── SparkSqlParser.scala │ │ └── scala-spark-3.3/ │ │ └── com/ │ │ └── zto/ │ │ └── fire/ │ │ └── spark/ │ │ └── sql/ │ │ ├── SparkSqlExtensionsParser.scala │ │ └── SparkSqlParser.scala │ └── pom.xml ├── fire-enhance/ │ ├── apache-arthas/ │ │ ├── pom.xml │ │ └── src/ │ │ 
└── main/ │ │ └── java/ │ │ └── com/ │ │ └── taobao/ │ │ └── arthas/ │ │ └── agent/ │ │ └── attach/ │ │ └── ArthasAgent.java │ ├── apache-flink/ │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ ├── java-flink-1.12/ │ │ │ └── org/ │ │ │ ├── apache/ │ │ │ │ └── flink/ │ │ │ │ ├── client/ │ │ │ │ │ └── deployment/ │ │ │ │ │ └── application/ │ │ │ │ │ └── ApplicationDispatcherBootstrap.java │ │ │ │ ├── configuration/ │ │ │ │ │ └── GlobalConfiguration.java │ │ │ │ ├── contrib/ │ │ │ │ │ └── streaming/ │ │ │ │ │ └── state/ │ │ │ │ │ ├── RocksDBStateBackend.java │ │ │ │ │ └── restore/ │ │ │ │ │ └── RocksDBFullRestoreOperation.java │ │ │ │ ├── runtime/ │ │ │ │ │ ├── checkpoint/ │ │ │ │ │ │ └── CheckpointCoordinator.java │ │ │ │ │ └── util/ │ │ │ │ │ └── EnvironmentInformation.java │ │ │ │ ├── table/ │ │ │ │ │ └── api/ │ │ │ │ │ └── internal/ │ │ │ │ │ └── TableEnvironmentImpl.java │ │ │ │ └── util/ │ │ │ │ └── ExceptionUtils.java │ │ │ └── rocksdb/ │ │ │ └── RocksDB.java │ │ ├── java-flink-1.13/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── flink/ │ │ │ ├── client/ │ │ │ │ └── deployment/ │ │ │ │ └── application/ │ │ │ │ └── ApplicationDispatcherBootstrap.java │ │ │ ├── configuration/ │ │ │ │ └── GlobalConfiguration.java │ │ │ ├── contrib/ │ │ │ │ └── streaming/ │ │ │ │ └── state/ │ │ │ │ └── EmbeddedRocksDBStateBackend.java │ │ │ ├── runtime/ │ │ │ │ ├── checkpoint/ │ │ │ │ │ └── CheckpointCoordinator.java │ │ │ │ └── util/ │ │ │ │ └── EnvironmentInformation.java │ │ │ ├── table/ │ │ │ │ └── api/ │ │ │ │ └── internal/ │ │ │ │ └── TableEnvironmentImpl.java │ │ │ └── util/ │ │ │ └── ExceptionUtils.java │ │ └── java-flink-1.14/ │ │ └── org/ │ │ ├── apache/ │ │ │ └── flink/ │ │ │ ├── client/ │ │ │ │ └── deployment/ │ │ │ │ └── application/ │ │ │ │ └── ApplicationDispatcherBootstrap.java │ │ │ ├── configuration/ │ │ │ │ └── GlobalConfiguration.java │ │ │ ├── connector/ │ │ │ │ └── jdbc/ │ │ │ │ ├── dialect/ │ │ │ │ │ ├── AdbDialect.java │ │ │ │ │ ├── JdbcDialect.java │ │ │ │ │ ├── JdbcDialects.java │ │ │ │ │ ├── MySQLDialect.java │ │ │ │ │ └── OracleSQLDialect.java │ │ │ │ └── internal/ │ │ │ │ └── converter/ │ │ │ │ └── OracleSQLRowConverter.java │ │ │ ├── contrib/ │ │ │ │ └── streaming/ │ │ │ │ └── state/ │ │ │ │ └── EmbeddedRocksDBStateBackend.java │ │ │ ├── runtime/ │ │ │ │ ├── checkpoint/ │ │ │ │ │ └── CheckpointCoordinator.java │ │ │ │ └── util/ │ │ │ │ └── EnvironmentInformation.java │ │ │ ├── streaming/ │ │ │ │ └── connectors/ │ │ │ │ └── kafka/ │ │ │ │ ├── FlinkKafkaConsumer.java │ │ │ │ └── FlinkKafkaConsumerBase.java │ │ │ ├── table/ │ │ │ │ └── api/ │ │ │ │ └── internal/ │ │ │ │ └── TableEnvironmentImpl.java │ │ │ └── util/ │ │ │ └── ExceptionUtils.java │ │ └── rocksdb/ │ │ └── RocksDB.java │ ├── apache-spark/ │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ └── scala-spark-3.0/ │ │ └── org/ │ │ └── apache/ │ │ └── spark/ │ │ ├── internal/ │ │ │ └── config/ │ │ │ └── Streaming.scala │ │ ├── sql/ │ │ │ └── execution/ │ │ │ └── datasources/ │ │ │ └── InsertIntoHadoopFsRelationCommand.scala │ │ └── streaming/ │ │ └── scheduler/ │ │ └── ExecutorAllocationManager.scala │ └── pom.xml ├── fire-examples/ │ ├── flink-examples/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ ├── java/ │ │ │ │ └── com/ │ │ │ │ └── zto/ │ │ │ │ └── fire/ │ │ │ │ ├── examples/ │ │ │ │ │ └── bean/ │ │ │ │ │ ├── People.java │ │ │ │ │ └── Student.java │ │ │ │ └── sql/ │ │ │ │ └── SqlCommandParser.java │ │ │ ├── resources/ │ │ │ │ ├── META-INF/ │ │ │ │ │ └── services/ │ │ │ │ │ └── org.apache.flink.table.factories.Factory │ │ │ 
│ ├── common.properties │ │ │ │ ├── connector/ │ │ │ │ │ └── hive/ │ │ │ │ │ └── HiveSinkTest.properties │ │ │ │ ├── log4j.properties │ │ │ │ └── stream/ │ │ │ │ └── ConfigCenterTest.properties │ │ │ └── scala/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── examples/ │ │ │ └── flink/ │ │ │ ├── FlinkDemo.scala │ │ │ ├── FlinkSQLDemo.scala │ │ │ ├── Test.scala │ │ │ ├── acc/ │ │ │ │ └── FlinkAccTest.scala │ │ │ ├── batch/ │ │ │ │ ├── FireMapFunctionTest.scala │ │ │ │ ├── FlinkBatchTest.scala │ │ │ │ └── FlinkBrocastTest.scala │ │ │ ├── connector/ │ │ │ │ ├── FlinkHudiTest.scala │ │ │ │ ├── bean/ │ │ │ │ │ ├── BeanConnectorTest.scala │ │ │ │ │ ├── BeanDynamicTableFactory.scala │ │ │ │ │ ├── BeanDynamicTableSink.scala │ │ │ │ │ ├── BeanDynamicTableSource.scala │ │ │ │ │ └── BeanOptions.scala │ │ │ │ ├── clickhouse/ │ │ │ │ │ └── ClickhouseTest.scala │ │ │ │ ├── hive/ │ │ │ │ │ ├── HiveBatchSinkTest.scala │ │ │ │ │ └── HiveSinkTest.scala │ │ │ │ ├── kafka/ │ │ │ │ │ └── KafkaConsumer.scala │ │ │ │ ├── rocketmq/ │ │ │ │ │ ├── RocketMQConnectorTest.scala │ │ │ │ │ └── RocketTest.scala │ │ │ │ └── sql/ │ │ │ │ ├── DDL.scala │ │ │ │ └── DataGenTest.scala │ │ │ ├── lineage/ │ │ │ │ ├── FlinkSqlLineageTest.scala │ │ │ │ └── LineageTest.scala │ │ │ ├── module/ │ │ │ │ ├── ArthasTest.scala │ │ │ │ └── ExceptionTest.scala │ │ │ ├── sql/ │ │ │ │ ├── HiveDimDemo.scala │ │ │ │ ├── HiveWriteDemo.scala │ │ │ │ ├── JdbcDimDemo.scala │ │ │ │ ├── RocketMQConnectorTest.scala │ │ │ │ ├── SimpleSqlDemo.scala │ │ │ │ └── SqlJoinDemo.scala │ │ │ ├── stream/ │ │ │ │ ├── ConfigCenterTest.scala │ │ │ │ ├── FlinkHiveTest.scala │ │ │ │ ├── FlinkPartitioner.scala │ │ │ │ ├── FlinkRetractStreamTest.scala │ │ │ │ ├── FlinkSinkHiveTest.scala │ │ │ │ ├── FlinkSinkTest.scala │ │ │ │ ├── FlinkSourceTest.scala │ │ │ │ ├── FlinkStateTest.scala │ │ │ │ ├── HBaseTest.scala │ │ │ │ ├── HiveRW.scala │ │ │ │ ├── JdbcTest.scala │ │ │ │ ├── UDFTest.scala │ │ │ │ ├── WatermarkTest.scala │ │ │ │ └── WindowTest.scala │ │ │ └── util/ │ │ │ └── StateCleaner.scala │ │ └── test/ │ │ └── scala/ │ │ └── com/ │ │ └── zto/ │ │ └── fire/ │ │ └── examples/ │ │ └── flink/ │ │ ├── anno/ │ │ │ └── AnnoConfTest.scala │ │ ├── core/ │ │ │ └── BaseFlinkTester.scala │ │ └── jdbc/ │ │ └── JdbcUnitTest.scala │ ├── pom.xml │ └── spark-examples/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ ├── java/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── examples/ │ │ │ └── bean/ │ │ │ ├── Hudi.java │ │ │ ├── Student.java │ │ │ └── StudentMulti.java │ │ ├── resources/ │ │ │ ├── common.properties │ │ │ ├── jdbc/ │ │ │ │ └── JdbcTest.properties │ │ │ └── streaming/ │ │ │ └── ConfigCenterTest.properties │ │ └── scala/ │ │ └── com/ │ │ └── zto/ │ │ └── fire/ │ │ └── examples/ │ │ └── spark/ │ │ ├── SparkDemo.scala │ │ ├── SparkSQLDemo.scala │ │ ├── Test.scala │ │ ├── acc/ │ │ │ └── FireAccTest.scala │ │ ├── hbase/ │ │ │ ├── HBaseConnectorTest.scala │ │ │ ├── HBaseHadoopTest.scala │ │ │ ├── HBaseStreamingTest.scala │ │ │ ├── HbaseBulkTest.scala │ │ │ └── HiveQL.scala │ │ ├── hive/ │ │ │ ├── HiveClusterReader.scala │ │ │ ├── HiveMetadataTest.scala │ │ │ └── HiveRW.scala │ │ ├── jdbc/ │ │ │ ├── JdbcStreamingTest.scala │ │ │ └── JdbcTest.scala │ │ ├── lineage/ │ │ │ ├── DataSourceTest.scala │ │ │ ├── LineageTest.scala │ │ │ └── SparkCoreLineageTest.scala │ │ ├── module/ │ │ │ ├── ArthasTest.scala │ │ │ └── ExceptionTest.scala │ │ ├── schedule/ │ │ │ ├── ScheduleTest.scala │ │ │ └── Tasks.scala │ │ ├── sql/ │ │ │ ├── LoadTestSQL.scala │ │ │ └── 
SparkSqlParseTest.scala │ │ ├── streaming/ │ │ │ ├── AtLeastOnceTest.scala │ │ │ ├── ConfigCenterTest.scala │ │ │ ├── DataGenTest.scala │ │ │ ├── KafkaTest.scala │ │ │ └── RocketTest.scala │ │ ├── structured/ │ │ │ ├── JdbcSinkTest.scala │ │ │ ├── MapTest.scala │ │ │ └── StructuredStreamingTest.scala │ │ └── thread/ │ │ └── ThreadTest.scala │ └── test/ │ ├── resources/ │ │ ├── ConfigCenterUnitTest.properties │ │ ├── SparkSQLParserTest.properties │ │ └── common.properties │ ├── scala/ │ │ └── com/ │ │ └── zto/ │ │ └── fire/ │ │ └── examples/ │ │ └── spark/ │ │ ├── anno/ │ │ │ └── AnnoConfTest.scala │ │ ├── conf/ │ │ │ └── ConfigCenterUnitTest.scala │ │ ├── core/ │ │ │ └── BaseSparkTester.scala │ │ ├── hbase/ │ │ │ ├── HBaseApiTest.scala │ │ │ ├── HBaseBaseTester.scala │ │ │ ├── HBaseBulkUnitTest.scala │ │ │ ├── HBaseConnectorUnitTest.scala │ │ │ └── HBaseHadoopUnitTest.scala │ │ ├── hive/ │ │ │ └── HiveUnitTest.scala │ │ ├── jdbc/ │ │ │ ├── JdbcConnectorTest.scala │ │ │ └── JdbcUnitTest.scala │ │ └── parser/ │ │ └── SparkSQLParserTest.scala │ └── scala-spark-3.0/ │ └── com/ │ └── zto/ │ └── fire/ │ └── examples/ │ └── spark/ │ └── sql/ │ └── SparkSqlParseTest.scala ├── fire-external/ │ ├── .gitignore │ ├── fire-apollo/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ ├── resources/ │ │ │ │ └── apollo.properties │ │ │ └── scala/ │ │ │ └── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── apollo/ │ │ │ └── util/ │ │ │ ├── ApolloConfigUtil.scala │ │ │ └── ApolloConstant.scala │ │ └── test/ │ │ └── scala/ │ │ └── com/ │ │ └── zto/ │ │ └── fire/ │ │ └── apollo/ │ │ └── util/ │ │ └── ApolloConfigUtilTest.scala │ └── pom.xml ├── fire-metrics/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ └── java/ │ │ └── com/ │ │ └── zto/ │ │ └── fire/ │ │ └── metrics/ │ │ └── MetricsDemo.scala │ └── test/ │ ├── java/ │ │ └── com/ │ │ └── zto/ │ │ └── fire/ │ │ └── jmx/ │ │ ├── Hello.java │ │ ├── HelloMBean.java │ │ ├── JmxApp.java │ │ ├── QueueSample.java │ │ ├── QueueSampler.java │ │ └── QueueSamplerMXBean.java │ └── scala/ │ └── com.zto.fire.metrics/ │ └── MetricsTest.scala ├── fire-platform/ │ └── pom.xml ├── fire-shell/ │ ├── flink-shell/ │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── flink/ │ │ │ └── api/ │ │ │ └── java/ │ │ │ ├── JarHelper.java │ │ │ ├── ScalaShellEnvironment.java │ │ │ └── ScalaShellStreamEnvironment.java │ │ ├── java-flink-1.12/ │ │ │ └── org.apache.flink.streaming.api.environment/ │ │ │ └── StreamExecutionEnvironment.java │ │ ├── java-flink-1.13/ │ │ │ └── org.apache.flink.streaming.api.environment/ │ │ │ └── StreamExecutionEnvironment.java │ │ └── scala/ │ │ ├── com/ │ │ │ └── zto/ │ │ │ └── fire/ │ │ │ └── shell/ │ │ │ └── flink/ │ │ │ ├── FireILoop.scala │ │ │ └── Test.scala │ │ └── org/ │ │ └── apache/ │ │ └── flink/ │ │ └── api/ │ │ └── scala/ │ │ └── FlinkShell.scala │ ├── pom.xml │ └── spark-shell/ │ ├── pom.xml │ └── src/ │ └── main/ │ └── scala-spark-3.0/ │ ├── com/ │ │ └── zto/ │ │ └── fire/ │ │ └── shell/ │ │ └── spark/ │ │ ├── FireILoop.scala │ │ ├── Main.scala │ │ └── Test.scala │ └── org/ │ └── apache/ │ └── spark/ │ └── repl/ │ ├── ExecutorClassLoader.scala │ └── Signaling.scala └── pom.xml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .idea/* fire-parent.iml *.iml target/ *.log ================================================ FILE: LICENSE 
================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. 
Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
================================================
FILE: README.md
================================================

# The Fire Framework

The Fire framework is a big data framework developed and open-sourced by **ZTO Big Data**, built specifically for developing **Spark** and **Flink** jobs. It hides technical details behind a large set of simple APIs that help developers build real-time computing jobs faster, and it ships with platform-oriented features for integration with a real-time computing platform. At ZTO, jobs built on Fire process **several hundred billion records per day**, covering **Spark** (batch & streaming), **Flink**, and many other computing scenarios.

## 1. It's That Simple!

### 1.1 Flink example

```scala
@Config(
  """
    |state.checkpoints.num-retained=30      # any Flink tuning parameter, Fire framework parameter, or user-defined parameter
    |state.checkpoints.dir=hdfs:///user/flink/checkpoint
    |""")
@Hive("thrift://localhost:9083") // connect to the specified Hive metastore
@Streaming(interval = 100, unaligned = true) // checkpoint every 100s, with unaligned checkpoints enabled
@Kafka(brokers = "localhost:9092", topics = "fire", groupId = "fire")
object FlinkDemo extends FlinkStreaming {

  @Process
  def kafkaSource: Unit = {
    val dstream = this.fire.createKafkaDirectStream() // consume Kafka through the API
    sql("""create table statement ...""")
    sql("""insert into statement ...""")
  }
}
```

### 1.2 Spark example

```scala
@Config(
  """
    |spark.shuffle.compress=true      # any Spark tuning parameter, Fire framework parameter, or user-defined parameter
    |spark.ui.enabled=true
    |""")
@Hive("thrift://localhost:9083") // connect to the specified Hive metastore
@Streaming(interval = 100, maxRatePerPartition = 100) // 100s streaming batches, with a per-partition rate limit
@Kafka(brokers = "localhost:9092", topics = "fire", groupId = "fire")
object SparkDemo extends SparkStreaming {

  @Process
  def kafkaSource: Unit = {
    val dstream = this.fire.createKafkaDirectStream() // consume Kafka through the API
    sql("""select * from xxx""").show()
  }
}
```

***Note: Structured Streaming, Spark Core, Flink SQL, and Flink batch jobs are all supported, with the same code structure as the examples above.***

## *[2. Developer Documentation](./docs/index.md)*

## 3. Plenty of Highlights!

### 3.1 Compatible with mainstream versions

Fire is adapted to different Spark and Flink versions: all Spark versions from 2.x onward and all Flink versions from 1.10 onward are supported, and the framework can be compiled against Scala 2.11 or Scala 2.12.

```shell
# Build Fire against the engine versions you actually need
mvn clean install -DskipTests -Pspark-3.0.2 -Pflink-1.14.3 -Pscala-2.12
```

| Apache Spark | Apache Flink |
| ------------ | ------------ |
| 2.3.x        | 1.10.x       |
| 2.4.x        | 1.11.x       |
| 3.0.x        | 1.12.x       |
| 3.1.x        | 1.13.x       |
| 3.2.x        | 1.14.x       |
| 3.3.x        | 1.15.x       |

### **3.2 Simple and easy to use**

Fire is highly encapsulated and hides a great deal of technical detail; many connectors need only a single line of code for their core functionality. Fire also unifies the commonly used APIs of Spark and Flink, so both engines can be programmed in the same code style.

- **HBase API**

```scala
// Read the rows for the given rowkeys from HBase and return the result set as a DataFrame
val studentDF: DataFrame = this.fire.hbaseGetDF(hTableName, classOf[Student], getRDD)
// Insert the given dataset into the specified HBase table in a distributed fashion
this.fire.hbasePutDF(hTableName, studentDF, classOf[Student])
```

- **JDBC API**

```scala
// Insert the specified columns of a DataFrame into a relational database, 100 rows per batch
df.jdbcBatchUpdate(insertSql, Seq("name", "age", "createTime", "length", "sex"), batch = 100)
// Map a query result into a DataFrame via reflection
val df: DataFrame = this.fire.jdbcQueryDF(querySql, Seq(1, 2, 3), classOf[Student])
```

### **3.3 Flexible configuration**

Configuration can be provided through an interface, Apollo, configuration files, or annotations. **Engine parameters** (Spark & Flink), **Fire framework parameters**, and **user-defined parameters** can be mixed freely, and configuration can be modified dynamically at runtime. The common configuration styles are listed below ([configuration manual](./docs/config.md)):

1. **Configuration file:** create a properties file named after the class and put the parameters there.
2. **Interface-based configuration:** Fire exposes a configuration interface; required settings are fetched through that interface, which is useful for platform-level configuration management.
3. **Annotation-based configuration:** cluster environments, connectors, and tuning parameters are configured through annotations. Commonly used annotations are shown below:

```scala
@Config(
  """
    |# any Flink tuning parameter, Fire framework parameter, or user-defined parameter
    |state.checkpoints.num-retained=30
    |state.checkpoints.dir=hdfs:///user/flink/checkpoint
    |""")
@Hive("thrift://localhost:9083")
@Checkpoint(interval = 100, unaligned = true)
@Kafka(brokers = "localhost:9092", topics = "fire", groupId = "fire")
@RocketMQ(brokers = "bigdata_test", topics = "fire", groupId = "fire", tag = "*", startingOffset = "latest")
@Jdbc(url = "jdbc:mysql://mysql-server:3306/fire", username = "root", password = "..root726")
@HBase("localhost:2181")
```

**Reading configuration:**

Fire wraps a unified configuration-reading API. With it, whether in Spark or Flink, and whether on the Driver/JobManager side or the Executor/TaskManager side, any configuration value can be read with a single line of code. With this API there is no longer any need to override the open method in Flink's map and other operators just to read configuration, which is very convenient.

```scala
this.conf.getString("my.conf")
this.conf.getInt("state.checkpoints.num-retained")
...
```

### **3.4 Multi-cluster support**

Fire's configuration supports any number of clusters: a single job can be configured with multiple HBase or Kafka data sources at the same time, distinguished by a numeric suffix (**keyNum**):

```scala
// Suppose multiple HBase clusters are configured via annotations:
@HBase("localhost:2181")
@HBase2(cluster = "192.168.0.1:2181", storageLevel = "DISK_ONLY")

// In code, the numeric suffix selects the cluster
this.fire.hbasePutDF(hTableName, studentDF, classOf[Student]) // keyNum=1 by default: use the cluster configured by @HBase
this.fire.hbasePutDF(hTableName2, studentDF, classOf[Student], keyNum = 2) // keyNum=2: use the cluster configured by @HBase2
```

### **3.5 Common connectors**

Common connectors such as Kafka, RocketMQ, Redis, HBase, JDBC, ClickHouse, Hive, Hudi, TiDB, and ADB are supported.

### **3.6 [Hot modification of checkpoint settings](./docs/highlight/checkpoint.md)**

The checkpoint interval, timeout, number of concurrent checkpoints, and other parameters can be adjusted dynamically at runtime, avoiding checkpoint pressure caused by backpressure when a job restarts.

### **3.7 [Hot restart of streaming batches](./docs/highlight/spark-duration.md)**

This feature is mainly intended for Spark Streaming jobs. Through a hot-restart technique, the batch duration can be modified without restarting the Spark Streaming job; for example, changing a job's batch duration to 10s on the web side takes effect immediately.

### **3.8 Hot configuration updates**

By simply updating a configuration item on a web page, a running job receives the latest configuration and applies it immediately. The most typical use case is adjusting the partition count of a Spark operator: when the volume of processed data grows, the concrete number of repartition partitions can be increased through this feature, taking effect immediately.

### **3.9 Online performance diagnosis**

Arthas is deeply integrated, so running jobs can be profiled dynamically. Fire exposes REST interfaces for Arthas diagnosis: through these interfaces, an Arthas diagnostic thread can be dynamically switched on or off for the driver/jobmanager or the executors/taskmanagers, which then register with a shared Arthas tunnel service, after which Arthas commands can be entered in the web console for performance diagnosis.

![arthas-shell](docs/img/arthas-shell.png)

### **3.10 Online SQL debugging**

Fire exposes RESTful interfaces through which a platform or other system can dynamically pass SQL statements to a running job. Fire submits the SQL to the corresponding engine and returns the execution result through the interface, enabling online debugging of real-time SQL development and avoiding the time cost of repeatedly editing, publishing, and re-running code.

### **3.11 Real-time lineage**

Fire can analyze, at runtime, the data sources, databases and tables, and operation types used by each job, and exposes this lineage information through interfaces. A real-time platform or other web system can obtain the lineage in real time via interface calls.

### **3.12 Scheduled tasks**

Fire embeds the Quartz framework, so a scheduled task can be registered simply by adding the @Scheduled annotation.

```scala
/**
 * A method annotated with @Scheduled is a scheduled task and is executed periodically
 *
 * @scope        by default runs on both the driver and executors; if "driver" is specified, it runs only on the driver
 * @initialDelay how long to wait before the first execution
 */
@Scheduled(cron = "0/5 * * * * ?", scope = "driver", initialDelay = 60000)
def loadTable: Unit = {
  this.logger.info("refreshing the dimension table")
}
```

### **3.13 Seamless platform integration**

Fire has a built-in RESTful server and exposes many features through interfaces, so a real-time platform can connect to every real-time job through the interfaces exposed by the framework.

### **3.14 fire-shell**

Fire integrates the Spark shell and the Flink shell, supporting dynamic debugging of Spark and Flink jobs through a REPL, with all of Fire's APIs available. The shell capability is exposed to a real-time platform through interfaces, so Spark and Flink jobs can then be debugged from a web page.

## *[4. Changelog](./docs/feature.md)*

## 5. Join Us

**Technical discussion (DingTalk group): *35373471***


================================================
FILE: docs/accumulator.md
================================================

# Accumulators

Fire deeply customizes the Spark and Flink accumulators. This API does not require declaring accumulator variables in advance and can be used anywhere. [Example code](../fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/acc/FireAccTest.scala)

### 1. Basic usage of accumulators

```scala
// Consume messages
val dstream = this.fire.createKafkaDirectStream()
dstream.foreachRDD(rdd => {
  rdd.coalesce(this.conf.getInt(key, 10)).foreachPartition(t => {
    // Single-value accumulator
    this.acc.addCounter(1)
    // Multi-value accumulator: values are accumulated separately per key; the two lines below
    // accumulate into the "multiCounter" and "partitions" accumulators respectively
    this.acc.addMultiCounter("multiCounter", 1)
    this.acc.addMultiCounter("partitions", 1)
    // Time-dimension accumulator: adds a time dimension on top of the multi-value accumulator,
    // e.g. hbaseWriter 2019-09-10 11:00:00 10
    // e.g. hbaseWriter 2019-09-10 11:01:00 21
    this.acc.addMultiTimer("multiTimer", 1)
  })
})
```

### 2. Accumulator types

1. Single-value accumulator

   Data is always accumulated into one and the same, globally unique accumulator.

2. Multi-value accumulator

   Different accumulator instances are distinguished by different string keys, and values with the same key are accumulated together, making it more powerful than the single-value accumulator.

3. Time-dimension accumulator

   The time-dimension accumulator further extends the multi-value accumulator by introducing a time dimension: the time and the accumulator label together form a composite accumulator key. For example, with the key hbase_sink, statistics are aggregated per minute by default, and the next minute opens a brand-new accumulation window. The timestamp format can be changed through a parameter, e.g. per minute, hour, day, month, or year.

```scala
// Time-dimension accumulator: adds a time dimension on top of the multi-value accumulator,
// e.g. hbaseWriter 2019-09-10 11:00:00 10
// e.g. hbaseWriter 2019-09-10 11:01:00 21
this.acc.addMultiTimer("multiTimer", 1)

// Specify the timestamp schema to accumulate per hour
this.acc.addMultiTimer("multiTimer", 1, schema = "YYYY-MM-dd HH")
```

### 3. Reading accumulator values

1. In code

```scala
/**
 * Read the accumulator values
 */
@Scheduled(fixedInterval = 60 * 1000)
def printAcc: Unit = {
  this.acc.getMultiTimer.cellSet().foreach(t => println(s"key:" + t.getRowKey + " time:" + t.getColumnKey + " " + t.getValue + " records"))

  println("single value: " + this.acc.getCounter)

  this.acc.getMultiCounter.foreach(t => {
    println("multi value: key=" + t._1 + " value=" + t._2)
  })

  val size = this.acc.getMultiTimer.cellSet().size()
  println(s"===multiTimer.size=${size}==log.size=${this.acc.getLog.size()}===")
}
```

2. Through platform interfaces

Fire provides dedicated interfaces for reading accumulators, so a platform can fetch the latest accumulator statistics in real time via interface calls.

| Endpoint             | Purpose                                                 |
| -------------------- | ------------------------------------------------------- |
| /system/counter      | Reads the value of the single-value accumulator.        |
| /system/multiCounter | Reads the values of the multi-value accumulator.        |
| /system/multiTimer   | Reads the values of the time-dimension accumulator.     |


================================================
FILE: docs/anno.md
================================================

# The Fire Framework: Simplifying Flink and Spark Development with Annotations

Java introduced the **annotation** feature in JDK 5, and annotations were quickly adopted by all kinds of development frameworks, the most representative being Spring. Before annotations, Spring configuration usually had to be written in XML, which was verbose, hard to remember, and error-prone. Spring's developers recognized the problem and introduced a large number of annotations to replace the traditional XML configuration.

In the big data era, distributed computing engines represented by Hadoop, Spark, and Flink appeared one after another. Many people coming from Java web development were not comfortable with the APIs of these frameworks at first, and all of these frameworks chose not to adopt annotations. Many big data developers have probably imagined integrating Spring into a Spark or Flink project, but doing so raises many problems, because Spring is not designed for distributed big data computing.

**So where is the "Spring(time)" of real-time development? In the Fire framework!** To lower the barrier to real-time computing development as much as possible, reduce code, and make configuration more convenient, the **ZTO Express big data team** built the **Fire framework**. Fire has been refined inside ZTO for years; today **thousands** of production jobs are built on it. Jobs based on Fire process **several hundred billion records per day** and have passed one Double Eleven (Singles' Day) test after another. Fire is **open source and free**, supports both the **Spark** and **Flink** engines, and is concise, convenient, and easy to pick up in just a few minutes.

## 1. Quick-start examples

### 1.1 Flink example

```scala
@Streaming(interval = 100, unaligned = true, parallelism = 4) // checkpoint every 100s, unaligned checkpoints enabled
@Kafka(brokers = "localhost:9092", topics = "fire", groupId = "fire")
object FlinkDemo extends FlinkStreaming {

  @Process
  def kafkaSource: Unit = {
    val dstream = this.fire.createKafkaDirectStream() // consume Kafka through the API
    sql("""create table statement ...""")
    sql("""insert into statement ...""")
  }
}
```

### 1.2 Spark example

```scala
@Config(
  """
    |# any Spark/Flink tuning parameter, Fire framework parameter, or user-defined parameter
    |spark.shuffle.compress=true
    |spark.ui.enabled=true
    |""")
@Hive("thrift://localhost:9083") // connect to the specified Hive metastore
@Streaming(interval = 100, maxRatePerPartition = 100) // 100s streaming batches, with a per-partition rate limit
@Kafka(brokers = "localhost:9092", topics = "fire", groupId = "fire")
object SparkDemo extends SparkStreaming {

  @Process
  def kafkaSource: Unit = {
    val dstream = this.fire.createKafkaDirectStream() // consume Kafka through the API
    sql("""select * from xxx""").show()
  }
}
```
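The same pattern carries over to RocketMQ. A minimal sketch, assuming the @RocketMQ annotation and the createRocketMqPullStream() call described later in this article (the object name is illustrative and the stream is assumed to be a Spark DStream):

```scala
@Streaming(interval = 100, maxRatePerPartition = 100) // 100s streaming batches, with a per-partition rate limit
@RocketMQ(brokers = "bigdata_test", topics = "fire", groupId = "fire", tag = "*", startingOffset = "latest")
object RocketMQDemo extends SparkStreaming {

  @Process
  def rocketSource: Unit = {
    val dstream = this.fire.createRocketMqPullStream() // consume RocketMQ through the API
    dstream.print() // print a few records from each batch (assumes a DStream is returned)
  }
}
```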
通过上述两个代码示例可以看出,Fire框架为Spark和Flink开发提供了统一的编程风格,使用Fire提供的注解以及对应的父类,即可完成框架的集成。示例中的**@Config**注解支持多行配置,支持Spark或Flink的调优参数、Fire框架内置参数以及用户自定义参数。当任务执行起来时,Fire框架会根据这些配置信息,去初始化**SparkSession**或**ExecutionEnvironment**等上下文信息,避免了大量冗余的任务初始化代码。当然,@Config注解不是必须的,如果不指定,Fire框架会以默认最优的参数去初始化引擎上下文对象。**@Process**注解用于标记开发者代码逻辑的入口,标记该注解的方法会被FIre框架自动调用。mian方法也不是必须的,因为它被定义在了父类当中。   **@Streaming**注解同时适用于Spark Streaming以及Flink两大计算引擎,**interval**用于设置Spark Streaming的批次时间,或者是Flink的**checkpoint**间隔时间。**parallelism**用于指定Flink任务的全局并行度,maxRatePerPartition则是用于配置Spark Streaming的消费速率。@Streaming注解还有很多功能,包括开启checkpoint超时时间、是否开启非对齐checkpoint、Spark Streaming允许同时执行的批次数等。   **@Hive**注解用于指定hive metastore的url,适用于Spark和Flink。当指定了@Hive注解时,Fire框架在内部初始化时即可完成Spark与Flink创建hive catalog的动作。无需将hive-site.xml放到resources目录下,也不需要将hive相关的conf信息手动设置到SparkSession或ExecutionEnvironment中。   **@Kafka**和**@RocketMQ**注解用于配置消费消息队列相关的信息,指定好以后,在代码中即可完成一行代码接入: ```scala val dstream = this.fire.createKafkaDirectStream() // 使用api的方式消费kafka val dStream = this.fire.createRocketMqPullStream() // 使用api的方式消费rocketmq ``` ### 1.3 SQL开发示例   Fire框架对纯SQL开发也提供了很简洁的API,在开发中,将SQL语句(多条以分号分割)放到**sql()**方法中,然后在方法上标记**@Step**注解,即可被顺序执行。@Step注解中的中文描述,会被Fire框架自动打印到日志中,便于问题跟踪、异常排查等。当然,@Step注解和sql()方法一样可以应用到纯api开发的代码中,这些都是通用的。 ```scala @Streaming(interval = 60, parallelism = 2) object JdbcDimDemo extends FlinkStreaming { @Step1("数据源定义") def ddl: Unit = { sql( """ |CREATE TABLE t_mysql_dim ( | `id` BIGINT ... |) WITH ( ... |); | |CREATE TABLE t_kafka_fire ( | `id` BIGINT... |) WITH ( ... |) |""".stripMargin) } @Step2("kafka数据与mysql维表关联") def showJoin: Unit = { sql( """ |select | xxx |from t_kafka_fire t1 left join t_mysql_dim t2 on t1.id=t2.id |""".stripMargin).print() } } ``` 上述代码,Fire框架会根据代码中**@Step**注解的顺序,依次执行代码逻辑,并在日志中打印类似于下图的信息: ![anno_log](img/anno_log.png) ## 二、注解含义(Spark与Flink通用) - **@Config:**该注解支持Flink、Spark引擎相关参数、Fire框架参数以及用户自定义参数。对于引擎相关配置信息,会在构建**SparkSession**或Flink **ExecutionEnvironment**时自动设置生效,避免编写大量重复的用于构建引擎上文的代码。 - **@Streaming:**该注解支持Flink的Checkpoint相关参数,包括频率、超时时间等,还可以进行任务并发度的配置。而对于Spark Streaming任务,则用于设置批次时间、是否开启反压,以及反压情况下消费速率等参数。 - **@Kafka:**该注解用于配置任务中使用到的kafka集群信息,以及kafka-client相关调优参数。如果任务中消费多个kafka,可以使用@Kafka2、@Kafka3这种写法。 - **@Hive:**该注解用于指定任务中所使用的hive数仓thrift server地址。支持HDFS HA,支持跨集群读写Hive。 - **@Process:**该注解用于标记用户代码的入口,标记了@Process的方法会被Fire框架自动调起。 - **@HBase**:用于配置HBase相关连接信息,一行代码完成HBase的读写。 ```scala // 假设基于注解配置HBase多集群如下: @HBase("localhost:2181") @HBase2(cluster = "192.168.0.1:2181", storageLevel = "DISK_ONLY") // 代码中使用对应的数值后缀进行区分 this.fire.hbasePutDF(hTableName, studentDF, classOf[Student]) // 默认keyNum=1,表示使用@HBase注解配置的集群信息 this.fire.hbasePutDF(hTableName2, studentDF, classOf[Student], keyNum=2) // keyNum=2,表示使用@HBase2注解配置的集群信息 ``` - **@JDBC**:用于配置jdbc相关信息,Fire框架内部封装了数据库连接池,会自动获取该注解的配置信息。 ```scala @Jdbc(url = "jdbc:derby:memory:fire;create=true", username = "fire", password = "fire") val insertSql = s"INSERT INTO $tableName (name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" this.fire.jdbcUpdate(insertSql, Seq("admin", 12, timestamp, 10.0, 1)) ``` - **@Scheduled**:用法类似于Sping,支持在Spark Streaming或Flink任务中执行周期性任务。 ```scala /** * 声明了@Scheduled注解的方法是定时任务方法,会周期性执行 * * @scope 默认同时在driver端和executor端执行,如果指定了driver,则只在driver端定时执行 * @initialDelay 延迟多长时间开始执行第一次定时任务 */ @Scheduled(cron = "0/5 * * * * ?", scope = "driver", initialDelay = 60000) def loadTable: Unit = { this.logger.info("更新维表动作") } ``` - **@Before:**生命周期注解,用于在Fire框架初始化引擎上下文之前调用。 - **@After:**生命周期注解,用于在Fire退出jvm之前调用,可用于Spark批任务回收数据库连接池等对象。 ## 三、参考文章: - ### 
[Fire框架--快速的进行Spark与Flink开发](https://zhuanlan.zhihu.com/p/540808612) - ### [Fire框架--Flink Checkpoint运行时动态调优](https://zhuanlan.zhihu.com/p/551394441) - ### [Fire框架--Spark Streaming动态调整批次时间](https://zhuanlan.zhihu.com/p/552848864) - ### [Fire框架--Flink参数调优与参数获取](https://zhuanlan.zhihu.com/p/543184683) - ### [Fire框架--优雅的实现Flink定时任务](https://zhuanlan.zhihu.com/p/541358069) ## 四、Fire框架源码地址 - ### GitHub:https://github.com/ZTO-Express/fire - ### Gitee:https://gitee.com/RS131419/fire ## 五、Fire框架社区交流群 ### **技术交流(钉钉群):*35373471*** ================================================ FILE: docs/connector/adb.md ================================================ ## Flink adb connector *Flink adb connector基于jdbc sql connector改造,使用方法同flink标准的jdbc sql connector,fire框架能根据jdbc url自动识别是mysql还是adb。* ================================================ FILE: docs/connector/clickhouse.md ================================================ ### Flink clickhouse connector #### 一、DDL ```scala this.fire.sql( """ |CREATE TABLE t_user ( | `id` BIGINT, | `name` STRING, | `age` INT, | `sex` STRING, | `score` DECIMAL, | `birthday` TIMESTAMP |) WITH ( | 'connector' = 'clickhouse', | 'url' = 'jdbc:clickhouse://node01:8123,node02:8123,node03:8123', | 'database-name' = 'study', | 'username' = 'fire', | 'password' = 'fire', | 'use-local' = 'true', -- 指定为true,当分布式表写入时写的是本地表 | 'table-name' = 't_student', | 'sink.batch-size' = '10', | 'sink.flush-interval' = '3', | 'sink.max-retries' = '3' |) |""".stripMargin) ``` #### [二、完整示例](../fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/clickhouse/ClickhouseTest.scala) ================================================ FILE: docs/connector/hbase.md ================================================ # HBase 读写   HBase对更新和点查具有很好的支持,在实时计算场景下也是应用十分广泛的。为了进一步简化HBase读写api,提高开发效率,fire框架对HBase API进行了深度封装。目前支持3种读写模式,分别是:Java API、Bulk API以及Spark提供的API。另外,fire框架支持在同一个任务中对任意多个hbase集群同时进行读写。 ## 一、HBase集群配置 ### 1.1 定义别名 建议将hbase集群url信息定义成别名,别名定义放到名为common.properties的配置文件中。别名的好处是一处维护到处生效,方便共用,便于记忆。 ```properties # 定义hbase集群连接信息别名为test,代码中hbase配置简化为:@HBase("test") fire.hbase.cluster.map.test = zk01:2181,zk02:2181,zk03:2181 ``` ### 1.2 基于注解配置 ```scala @HBase("zk01:2181,zk02:2181,zk03:2181") @HBase2(cluster = "test", scanPartitions = 3, storageLevel = "DISK_ONLY") ``` ### 1.3 基于配置文件 ```properties # 方式一:直接指定zkurl hbase.cluster = zkurl # 方式二:事先定义好hbase别名与url的映射,然后通过别名配置,以下配置定义了别名test与url的映射关系 fire.hbase.cluster.map.test = zk01:2181,zk02:2181,zk03:2181 # 通过别名方式引用 hbase.cluster2 = test ``` ## 二、表与JavaBean映射 Fire框架通过Javabean与HBase表建立的关系简化读写api: ```java /** * 对应HBase表的JavaBean * * @author ChengLong 2019-6-20 16:06:16 */ @HConfig(multiVersion = true) public class Student extends HBaseBaseBean { private Long id; private String name; private Integer age; // 多列族情况下需使用family单独指定 private String createTime; // 若JavaBean的字段名称与HBase中的字段名称不一致,需使用value单独指定 // 此时hbase中的列名为length1,而不是length @FieldName(family = "data", value = "length1") private BigDecimal length; private Boolean sex; /** * rowkey的构建 * * @return */ @Override public Student buildRowKey() { this.rowKey = this.id.toString(); return this; } } ```   上述代码中定义了名为Student的Javabean,该Javabean需要继承自HBaseBaseBean,并实现buildRowKey方法,这个方法中需要告诉fire框架,rowKey是如何构建的。   通过以上两步即可实现Javabean与HBase表的关系绑定。对于个性化需求,如果需要以多版本的方式进行读写,则需在类名上添加@HConfig(multiVersion = true)注解。如果Javabean中的列名与HBase中的字段名不一致,可以通过@FieldName(family = "data", value = "length1")进行单独指定,当然,列族也可以通过这个注解指定。如果不知道列族名称,则默认只有一个名为info的列族。 目前暂不支持scala语言的class以及case class,仅支持基本的字段数据类型,不支持嵌套的或者复杂的字段类型。 ## 三、spark任务 
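  在分别介绍各种读写API之前,先给出一个最小化的端到端写入示意,展示注解、父类与HBase扩展方法是如何组合在一起的(其中object名称、表名与zk地址均为假设值,父类与算子的选择也仅作示意):

```scala
// 示意代码:将Student集合写入HBase
@HBase("zk01:2181,zk02:2181,zk03:2181")
object HBaseQuickStart extends SparkStreaming {

  @Process
  def process: Unit = {
    // rdd中的元素类型必须为HBaseBaseBean的子类(如上文定义的Student)
    val studentRDD = this.fire.createRDD(Student.newStudentList(), 2)
    studentRDD.hbasePutRDD("t_student")
  }
}
```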
### [1.1 java api](../fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/hbase/HBaseConnectorTest.scala) ```scala /** * 使用HBaseConnector插入一个rdd的数据 * rdd的类型必须为HBaseBaseBean的子类 */ def testHbasePutRDD: Unit = { val studentList = Student.newStudentList() val studentRDD = this.fire.createRDD(studentList, 2) // 为空的字段不插入 studentRDD.hbasePutRDD(this.tableName1) } /** * 使用HBaseConnector插入一个DataFrame的数据 */ def testHBasePutDF: Unit = { val studentList = Student.newStudentList() val studentDF = this.fire.createDataFrame(studentList, classOf[Student]) // 每个批次插100条 studentDF.hbasePutDF(this.tableName1, classOf[Student]) } /** * 使用HBaseConnector get数据,并将结果以RDD方式返回 */ def testHbaseGetRDD: Unit = { val getList = Seq("1", "2", "3", "5", "6") val getRDD = this.fire.createRDD(getList, 2) // 以多版本方式get,并将结果集封装到rdd中返回 val studentRDD = this.fire.hbaseGetRDD(this.tableName1, classOf[Student], getRDD) studentRDD.printEachPartition } /** * 使用HBaseConnector get数据,并将结果以DataFrame方式返回 */ def testHbaseGetDF: Unit = { val getList = Seq("1", "2", "3", "4", "5", "6") val getRDD = this.fire.createRDD(getList, 3) // get到的结果以dataframe形式返回 val studentDF = this.fire.hbaseGetDF(this.tableName1, classOf[Student], getRDD) studentDF.show(100, false) } /** * 使用HBaseConnector scan数据,并以RDD方式返回 */ def testHbaseScanRDD: Unit = { val rdd = this.fire.hbaseScanRDD2(this.tableName1, classOf[Student], "1", "6") rdd.repartition(3).printEachPartition } /** * 使用HBaseConnector scan数据,并以DataFrame方式返回 */ def testHbaseScanDF: Unit = { val dataFrame = this.fire.hbaseScanDF2(this.tableName1, classOf[Student], "1", "6") dataFrame.repartition(3).show(100, false) } ``` ### [1.2 bulk api](../fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/hbase/HbaseBulkTest.scala) ```scala /** * 使用bulk的方式将rdd写入到hbase */ def testHbaseBulkPutRDD: Unit = { // 方式一:将rdd的数据写入到hbase中,rdd类型必须为HBaseBaseBean的子类 val rdd = this.fire.createRDD(Student.newStudentList(), 2) // rdd.hbaseBulkPutRDD(this.tableName2) // 方式二:使用this.fire.hbaseBulkPut将rdd中的数据写入到hbase this.fire.hbaseBulkPutRDD(this.tableName2, rdd) // 第二个参数指定false表示不插入为null的字段到hbase中 // rdd.hbaseBulkPutRDD(this.tableName2, insertEmpty = false) // 第三个参数为true表示以多版本json格式写入 // rdd.hbaseBulkPutRDD(this.tableName3, false, true) } /** * 使用bulk的方式将DataFrame写入到hbase */ def testHbaseBulkPutDF: Unit = { // 方式一:将DataFrame的数据写入到hbase中 val rdd = this.fire.createRDD(Student.newStudentList(), 2) val studentDF = this.fire.createDataFrame(rdd, classOf[Student]) // insertEmpty=false表示为空的字段不插入 studentDF.hbaseBulkPutDF(this.tableName1, classOf[Student], keyNum = 2) // 方式二: // this.fire.hbaseBulkPutDF(this.tableName2, studentDF, classOf[Student]) } /** * 使用bulk方式根据rowKey获取数据,并将结果集以RDD形式返回 */ def testHBaseBulkGetRDD: Unit = { // 方式一:使用rowKey读取hbase中的数据,rowKeyRdd类型为String val rowKeyRdd = this.fire.createRDD(Seq(1.toString, 2.toString, 3.toString, 5.toString, 6.toString), 2) val studentRDD = rowKeyRdd.hbaseBulkGetRDD(this.tableName1, classOf[Student], keyNum = 2) studentRDD.foreach(println) // 方式二:使用this.fire.hbaseBulkGetRDD // val studentRDD2 = this.fire.hbaseBulkGetRDD(this.tableName2, rowKeyRdd, classOf[Student]) // studentRDD2.foreach(println) } /** * 使用bulk方式根据rowKey获取数据,并将结果集以DataFrame形式返回 */ def testHBaseBulkGetDF: Unit = { // 方式一:使用rowKey读取hbase中的数据,rowKeyRdd类型为String val rowKeyRdd = this.fire.createRDD(Seq(1.toString, 2.toString, 3.toString, 5.toString, 6.toString), 2) val studentDF = rowKeyRdd.hbaseBulkGetDF(this.tableName2, classOf[Student]) studentDF.show(100, false) // 
方式二:使用this.fire.hbaseBulkGetDF val studentDF2 = this.fire.hbaseBulkGetDF(this.tableName2, rowKeyRdd, classOf[Student]) studentDF2.show(100, false) } /** * 使用bulk方式进行scan,并将结果集映射为RDD */ def testHbaseBulkScanRDD: Unit = { // scan操作,指定rowKey的起止或直接传入自己构建的scan对象实例,返回类型为RDD[Student] val scanRDD = this.fire.hbaseBulkScanRDD2(this.tableName2, classOf[Student], "1", "6") scanRDD.foreach(println) } /** * 使用bulk方式进行scan,并将结果集映射为DataFrame */ def testHbaseBulkScanDF: Unit = { // scan操作,指定rowKey的起止或直接传入自己构建的scan对象实例,返回类型为DataFrame val scanDF = this.fire.hbaseBulkScanDF2(this.tableName2, classOf[Student], "1", "6") scanDF.show(100, false) } ``` ### [1.3 spark api](../fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/hbase/HBaseHadoopTest.scala) ```scala /** * 基于saveAsNewAPIHadoopDataset封装,将rdd数据保存到hbase中 */ def testHbaseHadoopPutRDD: Unit = { val studentRDD = this.fire.createRDD(Student.newStudentList(), 2) this.fire.hbaseHadoopPutRDD(this.tableName2, studentRDD, keyNum = 2) // 方式二:直接基于rdd进行方法调用 // studentRDD.hbaseHadoopPutRDD(this.tableName1) } /** * 基于saveAsNewAPIHadoopDataset封装,将DataFrame数据保存到hbase中 */ def testHbaseHadoopPutDF: Unit = { val studentRDD = this.fire.createRDD(Student.newStudentList(), 2) val studentDF = this.fire.createDataFrame(studentRDD, classOf[Student]) // 由于DataFrame相较于Dataset和RDD是弱类型的数据集合,所以需要传递具体的类型classOf[Type] this.fire.hbaseHadoopPutDF(this.tableName3, studentDF, classOf[Student]) // 方式二:基于DataFrame进行方法调用 // studentDF.hbaseHadoopPutDF(this.tableName3, classOf[Student]) } /** * 使用Spark的方式scan海量数据,并将结果集映射为RDD */ def testHBaseHadoopScanRDD: Unit = { val studentRDD = this.fire.hbaseHadoopScanRDD2(this.tableName2, classOf[Student], "1", "6", keyNum = 2) studentRDD.printEachPartition } /** * 使用Spark的方式scan海量数据,并将结果集映射为DataFrame */ def testHBaseHadoopScanDF: Unit = { val studentDF = this.fire.hbaseHadoopScanDF2(this.tableName3, classOf[Student], "1", "6") studentDF.show(100, false) } ``` ## 四、flink任务 *[样例代码:](../fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/HBaseTest.scala)* ```scala /** * table的hbase sink */ def testTableHBaseSink(stream: DataStream[Student]): Unit = { stream.createOrReplaceTempView("student") val table = this.flink.sqlQuery("select id, name, age from student group by id, name, age") // 方式一、自动将row转为对应的JavaBean // 注意:table对象上调用hbase api,需要指定泛型 table.hbasePutTable[Student](this.tableName).setParallelism(1) this.fire.hbasePutTable[Student](table, this.tableName2, keyNum = 2) // 方式二、用户自定义取数规则,从row中创建HBaseBaseBean的子类 table.hbasePutTable2[Student](this.tableName3)(row => new Student(1L, row.getField(1).toString, row.getField(2).toString.toInt)) // 或者 this.fire.hbasePutTable2[Student](table, this.tableName5, keyNum = 2)(row => new Student(1L, row.getField(1).toString, row.getField(2).toString.toInt)) } /** * table的hbase sink */ def testTableHBaseSink2(stream: DataStream[Student]): Unit = { val table = this.fire.sqlQuery("select id, name, age from student group by id, name, age") // 方式二、用户自定义取数规则,从row中创建HBaseBaseBean的子类 table.hbasePutTable2(this.tableName6)(row => new Student(1L, row.getField(1).toString, row.getField(2).toString.toInt)) // 或者 this.flink.hbasePutTable2(table, this.tableName7, keyNum = 2)(row => new Student(1L, row.getField(1).toString, row.getField(2).toString.toInt)) } /** * stream hbase sink */ def testStreamHBaseSink(stream: DataStream[Student]): Unit = { // 方式一、DataStream中的数据类型为HBaseBaseBean的子类 // stream.hbasePutDS(this.tableName) this.fire.hbasePutDS[Student](stream, this.tableName8) // 
方式二、将value组装为HBaseBaseBean的子类,逻辑用户自定义 stream.hbasePutDS2(this.tableName9, keyNum = 2)(value => value) // 或者 this.fire.hbasePutDS2(stream, this.tableName10)(value => value) } /** * stream hbase sink */ def testStreamHBaseSink2(stream: DataStream[Student]): Unit = { // 方式二、将value组装为HBaseBaseBean的子类,逻辑用户自定义 stream.hbasePutDS2(this.tableName11)(value => value) // 或者 this.fire.hbasePutDS2(stream, this.tableName12, keyNum = 2)(value => value) } /** * hbase的基本操作 */ def testHBase: Unit = { // get操作 val getList = ListBuffer(HBaseConnector.buildGet("1")) val student = HBaseConnector.get(this.tableName, classOf[Student], getList, 1) if (student != null) println(JSONUtils.toJSONString(student)) // scan操作 val studentList = HBaseConnector.scan(this.tableName, classOf[Student], HBaseConnector.buildScan("0", "9"), 1) if (studentList != null) println(JSONUtils.toJSONString(studentList)) // delete操作 HBaseConnector.deleteRows(this.tableName, Seq("1")) } ``` ## 五、多集群读写 Fire框架支持同一个任务中对任意多个hbase集群进行读写,首先要在配置文件中以keyNum进行指定要连接的所有hbase集群的zk地址: ```scala @HBase("zk01:2181") @HBase2("zk02:2181") @HBase3("zk03:2181") ``` ```properties hbase.cluster=zk01:2181 hbase.cluster3=zk02:2181 hbase.cluster8=zk03:2181 ``` 在代码中,通过keyNum参数告诉fire这行代码连接的hbase集群是哪个。注意:api中的keyNum要与配置中的数字对应上。 ```scala // insert 操作 studentRDD.hbasePutRDD(this.tableName1) studentRDD.hbasePutRDD(this.tableName2, keyNum = 3) studentRDD.hbasePutRDD(this.tableName3, keyNum = 8) // scan 操作 this.fire.hbaseScanDF2(this.tableName1, classOf[Student], "1", "6") this.fire.hbaseScanDF2(this.tableName1, classOf[Student], "1", "6", keyNum = 3) ``` ## 六、@HBase ```java /** * HBase集群连接信息:hbase.cluster */ String value() default ""; /** * HBase集群连接信息:hbase.cluster,同value */ String cluster() default ""; /** * 列族名称:hbase.column.family */ String family() default ""; /** * 每个线程最多insert的记录数:fire.hbase.batch.size */ int batchSize() default -1; /** * spark引擎:scan hbase后存放到rdd的多少个partition中:fire.hbase.scan.partitions */ int scanPartitions() default -1; /** * spark引擎:scan后的缓存级别:fire.hbase.storage.level */ String storageLevel() default ""; /** * flink引擎:sink hbase失败最大重试次数:hbase.max.retry */ int maxRetries() default -1; /** * WAL等级:hbase.durability */ String durability() default ""; /** * 是否启用表信息缓存,提高表是否存在判断的效率:fire.hbase.table.exists.cache.enable */ boolean tableMetaCache() default true; /** * hbase-client参数,以key=value形式注明 */ String[] config() default ""; ``` ## 七、hbase-client参数 hbase-client参数,可以通过@HBase的**config**或以**fire.hbase.conf.**为前缀的参数去指定: ```scala @HBase(cluster = "test", config = Array[String]("hbase.rpc.timeout=60000ms", "hbase.client.scanner.timeout.period=60000ms")) ``` ```properties fire.hbase.conf.hbase.rpc.timeout = 60000ms fire.hbase.conf.hbase.client.scanner.timeout.period = 60000ms ``` | 参数名称 | 引擎 | 含义 | | ------------------------------------------- | ----- | ------------------------------------------------------ | | fire.hbase.batch.size | 通用 | insert的批次大小,用于限制单个task一次最多sink的记录数 | | hbase.column.family | 通用 | 用于配置列族名称,默认info | | hbase.max.retry | flink | 当插入失败后,重试多少次 | | hbase.cluster | 通用 | 所需读写的Hbase集群url或别名 | | hbase.durability | 通用 | Hbase-client中的durability | | fire.hbase.storage.level | spark | 诊断scan后数据的缓存,避免重复scan hbase | | fire.hbase.scan.partitions | Spark | 通过HBase scan后repartition的分区数 | | fire.hbase.cluster.map. 
| 通用 | hbase集群映射配置前缀 | | fire.hbase.table.exists.cache.enable | 通用 | 是否开启HBase表存在判断的缓存 | | fire.hbase.table.exists.cache.reload.enable | 通用 | 是否开启HBase表存在列表缓存的定时更新任务 | | fire.hbase.table.exists.cache.initialDelay | 通用 | 定时刷新缓存HBase表任务的初始延迟 | | fire.hbase.table.exists.cache.period | 通用 | 定时刷新缓存HBase表任务的执行频率 | | fire.hbase.conf. | 通用 | hbase java api 配置前缀,支持任意hbase-client的参数 | ================================================ FILE: docs/connector/hive.md ================================================ # hive集成与配置 使用fire框架,仅需一行配置即可实现spark&flink与hive的无缝读写,甚至支持跨集群的读写(实时任务与hive不再同一个集群中)。 ### 一、基于注解 ```scala // 指定thrift server url地址,多个以逗号分隔 @Hive("thrift://thrift01:9083") // 指定thrift对应的别名,别名配置方式同kafka别名配置 @Hive("test") ``` ### 二、配置文件 ```properties # 方式一:直接指定hive的thrift server地址,多个以逗号分隔 spark.hive.cluster = thrift://hive01:9083,thrift://hive02:9083 # 方式二(推荐):如果已经通过fire.hive.cluster.map.xxx指定了别名,则可以直接使用别名 # 公共信息特别是集群信息建议放到commons.properties中 fire.hive.cluster.map.batch = thrift://hive03:9083,thrift://hive04:9083 # batch是上述url的hive别名,支持约定多个hive集群的别名 spark.hive.cluster = batch ``` ### 三、[示例代码](../fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/hive/HiveClusterReader.scala) ```scala // 通过上述配置,代码中就可以直接通过以下方式连接指定的hive this.fire.sql("select * from hive.tableName").show ``` ### 四、高可用   NameNode主备切换会导致那些读写hive的spark streaming任务挂掉。为了提高灵活性,避免将core-site.xml与hdfs-site.xml放到工程的resources目录下,fire提供了配置的方式,将Name Node HA信息通过配置文件进行指定。每项配置中的batch对应fire.hive.cluster.map.batch所指定的别名:batch,其他信息根据集群不同进行单独配置。如果有多个hive集群,可以配置多套HA配置。 ```properties # 用于是否启用HDFS HA spark.hdfs.ha.enable = true # 离线hive集群的HDFS HA配置项,规则为统一的ha前缀:spark.hdfs.ha.conf.+hive.cluster名称+hdfs专门的ha配置 spark.hdfs.ha.conf.batch.fs.defaultFS = hdfs://nameservice1 spark.hdfs.ha.conf.batch.dfs.nameservices = nameservice1 spark.hdfs.ha.conf.batch.dfs.ha.namenodes.nameservice1 = namenode5231,namenode5229 spark.hdfs.ha.conf.batch.dfs.namenode.rpc-address.nameservice1.namenode5231 = namenode01:8020 spark.hdfs.ha.conf.batch.dfs.namenode.rpc-address.nameservice1.namenode5229 = namenode02:8020 spark.hdfs.ha.conf.batch.dfs.client.failover.proxy.provider.nameservice1 = org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider ``` ### 五、hive set参数 ```properties # 以spark.hive.conf.为前缀的配置将直接生效,比如开启hive动态分区 # 原理是被直接执行:this.fire.sql("set hive.exec.dynamic.partition=true") spark.hive.conf.hive.exec.dynamic.partition = true spark.hive.conf.hive.exec.dynamic.partition.mode = nonstrict spark.hive.conf.hive.exec.max.dynamic.partitions = 5000 ``` ### 六、@Hive注解 ```java /** * hive连接别名:hive.cluster */ String value() default ""; /** * hive连接别名:hive.cluster,同value */ String cluster() default ""; /** * hive的版本:hive.version */ String version() default ""; /** * 在flink中hive的catalog名称:hive.catalog.name */ String catalog() default ""; /** * 分区名称(dt、ds):default.table.partition.name */ String partition() default ""; ``` ### 七、个性化配置 | 参数名称 | 引擎 | 含义 | | ---------------------------------- | ----- | ---------------------------------- | | flink.hive.version | flink | flink所集成的hive版本号 | | flink.default.database.name | flink | tmp | | flink.default.table.partition.name | flink | 默认的hive分区字段名称 | | flink.hive.catalog.name | flink | hive的catalog名称 | | fire.hive.cluster.map. | 通用 | hive thrift url别名映射 | | hive.conf. 
| 通用 | 通过固定的前缀配置支持所有hive参数 | ================================================ FILE: docs/connector/jdbc.md ================================================ # JDBC读写   实时任务开发中,对jdbc读写的需求很高。为了简化jdbc开发步骤,fire框架对jdbc操作做了进一步封装,将许多常见操作简化成一行代码。另外,fire框架支持在同一个任务中对任意多个数据源进行读写。 ### 一、数据源配置 #### 1.1 基于注解 ```scala @Jdbc(url = "jdbc:derby:memory:fire;create=true", username = "fire", password = "fire") @Jdbc3(url = "jdbc:derby:memory:fire;create=true", username = "fire", maxPoolSize=3, config=Array[String]("c3p0.key=value")) ``` #### 1.2 基于配置文件   数据源包括jdbc的url、driver、username与password等重要信息,建议将这些配置放到commons.properties中,避免每个任务单独配置。fire框架内置了c3p0数据库连接池,在分布式场景下,限制每个container默认最多3个connection,避免申请过多资源时申请太多的数据库连接。 ```properties db.jdbc.url = jdbc:derby:memory:fire;create=true db.jdbc.driver = org.apache.derby.jdbc.EmbeddedDriver db.jdbc.maxPoolSize = 3 db.jdbc.user = fire db.jdbc.password = fire # 如果需要多个数据源,则可在每项配置的结尾添加对应的keyNum作为区分 db.jdbc.url2 = jdbc:mysql://mysql:3306/fire db.jdbc.driver2 = com.mysql.jdbc.Driver db.jdbc.user2 = fire db.jdbc.password2 = fire ``` ### 二、API使用 #### [2.1 spark任务](../fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/jdbc/JdbcTest.scala) ```scala /** * 使用jdbc方式对关系型数据库进行增删改操作 */ def testJdbcUpdate: Unit = { val timestamp = DateFormatUtils.formatCurrentDateTime() // 执行insert操作 val insertSql = s"INSERT INTO $tableName (name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" this.fire.jdbcUpdate(insertSql, Seq("admin", 12, timestamp, 10.0, 1)) // 更新配置文件中指定的第二个关系型数据库 this.fire.jdbcUpdate(insertSql, Seq("admin", 12, timestamp, 10.0, 1), keyNum = 2) // 执行更新操作 val updateSql = s"UPDATE $tableName SET name=? WHERE id=?" this.fire.jdbcUpdate(updateSql, Seq("root", 1)) // 执行批量操作 val batchSql = s"INSERT INTO $tableName (name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" this.fire.jdbcBatchUpdate(batchSql, Seq(Seq("spark1", 21, timestamp, 100.123, 1), Seq("flink2", 22, timestamp, 12.236, 0), Seq("flink3", 22, timestamp, 12.236, 0), Seq("flink4", 22, timestamp, 12.236, 0), Seq("flink5", 27, timestamp, 17.236, 0))) // 执行批量更新 this.fire.jdbcBatchUpdate(s"update $tableName set sex=? where id=?", Seq(Seq(1, 1), Seq(2, 2), Seq(3, 3), Seq(4, 4), Seq(5, 5), Seq(6, 6))) // 方式一:通过this.fire方式执行delete操作 val sql = s"DELETE FROM $tableName WHERE id=?" 
this.fire.jdbcUpdate(sql, Seq(2)) // 方式二:通过JdbcConnector.executeUpdate // 同一个事务 /*val connection = this.jdbc.getConnection() this.fire.jdbcBatchUpdate("insert", connection = connection, commit = false, closeConnection = false) this.fire.jdbcBatchUpdate("delete", connection = connection, commit = false, closeConnection = false) this.fire.jdbcBatchUpdate("update", connection = connection, commit = true, closeConnection = true)*/ } /** * 将DataFrame数据写入到关系型数据库中 */ def testDataFrameSave: Unit = { val df = this.fire.createDataFrame(Student.newStudentList(), classOf[Student]) val insertSql = s"INSERT INTO spark_test(name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" // 指定部分DataFrame列名作为参数,顺序要对应sql中问号占位符的顺序,batch用于指定批次大小,默认取spark.db.jdbc.batch.size配置的值 df.jdbcBatchUpdate(insertSql, Seq("name", "age", "createTime", "length", "sex"), batch = 100) df.createOrReplaceTempViewCache("student") val sqlDF = this.fire.sql("select name, age, createTime from student where id>=1").repartition(1) // 若不指定字段,则默认传入当前DataFrame所有列,且列的顺序与sql中问号占位符顺序一致 sqlDF.jdbcBatchUpdate("insert into spark_test(name, age, createTime) values(?, ?, ?)") // 等同以上方式 // this.fire.jdbcBatchUpdateDF(sqlDF, "insert into spark_test(name, age, createTime) values(?, ?, ?)") } ``` #### [2.2 flink任务](../fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/JdbcTest.scala) ```scala /** * table的jdbc sink */ def testTableJdbcSink(stream: DataStream[Student]): Unit = { stream.createOrReplaceTempView("student") val table = this.fire.sqlQuery("select name, age, createTime, length, sex from student group by name, age, createTime, length, sex") // 方式一、table中的列顺序和类型需与jdbc sql中的占位符顺序保持一致 table.jdbcBatchUpdate(sql(this.tableName)).setParallelism(1) // 或者 this.fire.jdbcBatchUpdateTable(table, sql(this.tableName), keyNum = 6).setParallelism(1) // 方式二、自定义row取数规则,适用于row中的列个数和顺序与sql占位符不一致的情况 table.jdbcBatchUpdate2(sql(this.tableName), flushInterval = 10000, keyNum = 7)(row => { Seq(row.getField(0), row.getField(1), row.getField(2), row.getField(3), row.getField(4)) }) // 或者 this.flink.jdbcBatchUpdateTable2(table, sql(this.tableName), keyNum = 8)(row => { Seq(row.getField(0), row.getField(1), row.getField(2), row.getField(3), row.getField(4)) }).setParallelism(1) } /** * stream jdbc sink */ def testStreamJdbcSink(stream: DataStream[Student]): Unit = { // 方式一、指定字段列表,内部根据反射,自动获取DataStream中的数据并填充到sql中的占位符 // 此处fields有两层含义:1. sql中的字段顺序(对应表) 2. 
DataStream中的JavaBean字段数据(对应JavaBean) // 注:要保证DataStream中字段名称是JavaBean的名称,非表中字段名称 顺序要与占位符顺序一致,个数也要一致 stream.jdbcBatchUpdate(sql(this.tableName2), fields).setParallelism(3) // 或者 this.fire.jdbcBatchUpdateStream(stream, sql(this.tableName2), fields, keyNum = 6).setParallelism(1) // 方式二、通过用户指定的匿名函数方式进行数据的组装,适用于上面方法无法反射获取值的情况,适用面更广 stream.jdbcBatchUpdate2(sql(this.tableName2), 3, 30000, keyNum = 7) { // 在此处指定取数逻辑,定义如何将dstream中每列数据映射到sql中的占位符 value => Seq(value.getName, value.getAge, DateFormatUtils.formatCurrentDateTime(), value.getLength, value.getSex) }.setParallelism(1) // 或者 this.flink.jdbcBatchUpdateStream2(stream, sql(this.tableName2), keyNum = 8) { value => Seq(value.getName, value.getAge, DateFormatUtils.formatCurrentDateTime(), value.getLength, value.getSex) }.setParallelism(2) } ``` ### 三、多个数据源读写 Fire框架支持同一个任务中读写任意个数的数据源,只需要通过keyNum指定即可。配置和使用方式可以参考:HBase、kafka等。 ### 四、@JDBC ```java /** * Jdbc的url,同value */ String url(); /** * jdbc 驱动类,不填可根据url自动推断 */ String driver() default ""; /** * jdbc的用户名 */ String username(); /** * jdbc的密码 */ String password() default ""; /** * 事务的隔离级别 */ String isolationLevel() default ""; /** * 连接池的最大连接数 */ int maxPoolSize() default -1; /** * 连接池最少连接数 */ int minPoolSize() default -1; /** * 连接池初始连接数 */ int initialPoolSize() default -1; /** * 连接池的增量 */ int acquireIncrement() default -1; /** * 连接的最大空闲时间 */ int maxIdleTime() default -1; /** * 多少条操作一次 */ int batchSize() default -1; /** * flink引擎:flush的间隔周期(ms) */ long flushInterval() default -1; /** * flink引擎:失败最大重试次数 */ int maxRetries() default -1; /** * spark引擎:scan后的缓存级别:fire.jdbc.storage.level */ String storageLevel() default ""; /** * spark引擎:select后存放到rdd的多少个partition中:fire.jdbc.query.partitions */ int queryPartitions() default -1; /** * 日志中打印的sql长度 */ int logSqlLength() default -1; /** * c3p0参数,以key=value形式注明 */ String[] config() default ""; ``` ### 五、配置参数 列表中的配置参数可根据需要放到任务的配置文件中。 | 参数名称 | 引擎 | 含义 | | ------------------------ | ---- | ---------------------------------------- | | db.jdbc.url | 通用 | jdbc url | | db.jdbc.url.map. 
| 通用 | 用于为url取别名 |
| db.jdbc.driver | 通用 | driver class |
| db.jdbc.user | 通用 | 数据库用户名 |
| db.jdbc.password | 通用 | 数据库密码 |
| db.jdbc.isolation.level | 通用 | 事务的隔离级别 |
| db.jdbc.maxPoolSize | 通用 | 连接池最大连接数 |
| db.jdbc.minPoolSize | 通用 | 连接池最小连接数 |
| db.jdbc.acquireIncrement | 通用 | 当连接池连接数不足时,增量申请连接数大小 |

================================================
FILE: docs/connector/kafka.md
================================================

# Kafka 数据源

### 一、API使用

使用fire框架可以很方便的消费kafka中的数据,并且支持在同一任务中消费多个kafka集群的多个topic。核心代码仅一行:

```scala
// Spark Streaming任务
val dstream = this.fire.createKafkaDirectStream()
// structured streaming任务
val kafkaDataset = this.fire.loadKafkaParseJson()
// flink 任务
val dstream = this.fire.createKafkaDirectStream()
```

以上的api均支持kafka相关参数的传入,但fire推荐将这些集群信息放到配置文件中,增强代码可读性,提高代码简洁性与灵活性。

### 二、kafka配置

  你可能会疑惑,kafka的broker与topic信息并没有在代码中指定,程序是如何消费的呢?其实,这些信息都放到了任务同名的配置文件中。当然,你可以选择将这些kafka信息放到代码中指定。如果代码中指定了集群信息,同时配置文件中也有指定,则配置文件的优先级更高。

#### 2.1 定义别名

  建议将kafka集群url信息定义成别名,别名定义放到名为common.properties的配置文件中。别名的好处是一处维护到处生效,方便共用,便于记忆。

```properties
# 以下定义了两个kafka集群的别名,分别叫mq和test,别名与定义的url对应
fire.kafka.cluster.map.mq = kafka01:9092,kafka02:9092,kafka03:9092
fire.kafka.cluster.map.test = kafka-test01:9092,kafka-test02:9092,kafka-test03:9092
```

#### 2.2 基于注解配置

定义好别名以后,就可以使用注解的方式去配置kafka集群信息了,fire框架支持一个任务读写多个kafka:

```scala
@Kafka(brokers = "mq", topics = "fire", groupId = "fire")
@Kafka2(brokers = "test", topics = "fire", groupId = "fire", sessionTimeout = 600000, autoCommit = false)
```

#### 2.3 基于配置文件配置

```properties
spark.kafka.brokers.name = mq
# 必须配置项:kafka的topic列表,以逗号分隔
spark.kafka.topics = fire
# 用于指定groupId,如果不指定,则默认为当前类名
spark.kafka.group.id = fire

# 配置消费名为test的kafka集群,注意key的后缀统一添加2,用于标识不同的kafka集群
spark.kafka.brokers.name2 = test
# 必须配置项:kafka的topic列表,以逗号分隔
spark.kafka.topics2 = fire
# 用于指定groupId,如果不指定,则默认为当前类名
spark.kafka.group.id2 = fire
```

### 三、多kafka多topic消费

代码中是如何关联带有数字后缀的key的呢?答案是通过keyNum参数来指定:

```scala
// 对应spark.kafka.brokers.name=mq 或 @Kafka("mq")这个kafka集群,如果不指定keyNum,默认为1
val dstream = this.fire.createKafkaDirectStream()
// 对应spark.kafka.brokers.name2=test 或 @Kafka2("test")这个kafka集群
val dstream2 = this.fire.createKafkaDirectStream(keyNum=2)
```

### 四、offset提交

#### 4.1 主动提交

```scala
dstream.kafkaCommitOffsets()
```

#### 4.2 自动提交

  Spark streaming在处理数据过程中,由于offset提交与数据处理可能不在同一个算子中,就会出现stage失败、数据丢失,但offset却提交了的情况。为了解决这个问题,fire框架提供了***foreachRDDAtLeastOnce***算子,具备计算数据不丢、失败重试(默认3次)、处理成功自动提交offset等特性。

```scala
@Streaming(20) // spark streaming的批次时间
@Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire")
// 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中
object AtLeastOnceTest extends BaseSparkStreaming {

  override def process: Unit = {
    val dstream = this.fire.createKafkaDirectStream()
    // 至少一次的语义保证,处理成功自动提交offset,处理失败会重试指定次数,如果仍失败则任务退出
    dstream.foreachRDDAtLeastOnce(rdd => {
      val studentRDD = rdd.map(t => JSONUtils.parseObject[Student](t.value())).repartition(2)
      val insertSql = s"INSERT INTO spark_test(name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)"
      println("kafka.brokers.name=>" + this.conf.getString("kafka.brokers.name"))
      studentRDD.toDF().jdbcBatchUpdate(insertSql, Seq("name", "age", "createTime", "length", "sex"), batch = 1)
    })

    this.fire.start
  }
}
```

### 五、kafka-client参数调优

针对kafka-client的个性化参数,需要使用config来进行配置:

```scala
@Kafka(brokers = "kafka01:9092", config = Array[String]("session.timeout.ms=30000", "request.timeout.ms=30000"))
```

基于配置文件时,则使用以kafka.conf开头的key加上kafka-client参数即可:

```properties
# 以kafka.conf开头的配置支持所有kafka client的配置
kafka.conf.session.timeout.ms = 300000 kafka.conf.request.timeout.ms = 400000 kafka.conf.session.timeout.ms2 = 300000 ``` ### 六、代码示例 [1. spark消费kafka demo](../fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/streaming/KafkaTest.scala) [2. flink消费kafka demo](../fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/HBaseTest.scala) ### 七、@Kafka注解 ```java /** * kafka集群连接信息,同value */ String brokers(); /** * kafka topics,多个使用逗号分隔 */ String topics(); /** * 消费者标识 */ String groupId(); /** * 指定从何处开始消费 */ String startingOffset() default ""; /** * 指定消费到何处结束 */ String endingOffsets() default ""; /** * 是否开启主动提交offset */ boolean autoCommit() default false; /** * session超时时间(ms) */ long sessionTimeout() default -1; /** * request超时时间(ms) */ long requestTimeout() default -1; /** * poll的周期(ms) */ long pollInterval() default -1; /** * 从指定的时间戳开始消费 */ long startFromTimestamp() default -1; /** * 指定从kafka中保持的offset开始继续消费 */ boolean startFromGroupOffsets() default false; /** * 是否强制覆盖checkpoint中保持的offset信息,从指定位置开始消费 */ boolean forceOverwriteStateOffset() default false; /** * 是否在开启checkpoint的情况下强制周期性提交offset到kafka */ boolean forceAutoCommit() default false; /** * 强制提交的周期(ms) */ long forceAutoCommitInterval() default -1; /** * kafka-client参数,以key=value形式注明 */ String[] config() default ""; ``` ### 八、配置参数 | 参数名称 | 引擎 | 含义 | | ---------------------------------------- | ----- | ------------------------------------------------------------ | | fire.kafka.cluster.map. | 通用 | 用于定义kafka集群别名 | | kafka.conf. | 通用 | 用于设置kafka-client参数 | | kafka.brokers.name | 通用 | 指定消费的kafka集群url或别名 | | kafka.topics | 通用 | kafka的topic列表,以逗号分隔 | | kafka.group.id | 通用 | 消费kafka的group id | | kafka.starting.offsets | 通用 | kafka起始消费位点 | | kafka.ending.offsets | 通用 | kafka结束消费位点 | | kafka.enable.auto.commit | 通用 | 是否自动维护offset | | kafka.failOnDataLoss | 通用 | 丢失数据是否失败 | | kafka.session.timeout.ms | 通用 | kafka session超时时间 | | kafka.request.timeout.ms | 通用 | kafka request超时时间 | | kafka.max.poll.interval.ms | 通用 | kafka的最大poll周期 | | kafka.CommitOffsetsOnCheckpoints | flink | 当checkpoint时是否提交offset | | kafka.StartFromTimestamp | flink | 从指定时间戳开始消费 | | kafka.StartFromGroupOffsets | flink | 从指定offset开始消费 | | kafka.force.overwrite.stateOffset.enable | flink | 是否使状态中存放的offset不生效(请谨慎配置,用于kafka集群迁移等不正常状况的运维) | | kafka.force.autoCommit.enable | flink | 是否在开启checkpoint的情况下强制开启周期性offset提交 | | kafka.force.autoCommit.Interval | Flink | 周期性提交offset的时间间隔(ms) | ================================================ FILE: docs/connector/oracle.md ================================================ ## Flink oracle connector *Flink oracle connector基于jdbc sql connector改造,使用方法同flink标准的jdbc sql connector,fire框架能根据jdbc url自动识别是mysql还是oracle。* ================================================ FILE: docs/connector/rocketmq.md ================================================ # RocketMQ消息接入 ### 一、API使用 使用fire框架可以很方便的消费rocketmq中的数据,并且支持在同一任务中消费多个rocketmq集群的多个topic。核心代码仅一行: ```scala // Spark Streaming或flink streaming任务 val dstream = this.fire.createRocketMqPullStream() ``` 以上的api均支持rocketmq相关参数的传入,但fire推荐将这些集群信息放到配置文件中,增强代码可读性,提高代码简洁性与灵活性。 ### 二、flink sql connector ```scala this.fire.sql(""" |CREATE table source ( | id bigint, | name string, | age int, | length double, | data DECIMAL(10, 5) |) WITH | ( | 'connector' = 'fire-rocketmq', | 'format' = 'json', | 'rocket.brokers.name' = 'zms', | 'rocket.topics' = 'fire', | 'rocket.group.id' = 'fire', | 'rocket.consumer.tag' = '*' | ) |""".stripMargin) ``` **with参数的使用:**   Rocketmq sql 
connector中的with参数复用了api中的配置参数,如果需要进行rocketmq-client相关参数设置,可以以rocket.conf.为前缀,后面跟上rocketmq调优参数即可。 ### 二、RocketMQ配置 #### 2.1 基于注解 ```scala @RocketMQ(brokers = "bigdata_test", topics = "fire", groupId = "fire", tag = "*") @RocketMQ2(brokers = "bigdata_test", topics = "fire2", groupId = "fire2", tag = "*", startingOffset = "latest") ``` #### 2.2 基于配置文件 ```properties spark.rocket.brokers.name = rocketmq01:9876;rocketmq02:9876 spark.rocket.topics = topic_name spark.rocket.group.id = groupId spark.rocket.pull.max.speed.per.partition = 15000 spark.rocket.consumer.tag = * # 以spark.rocket.conf开头的配置支持所有rocket client的配置 spark.rocket.conf.pull.max.speed.per.partition = 5000 ``` ### 三、多RocketMQ多topic消费   实际生产场景下,会有同一个任务消费多个RocketMQ集群,多个topic的情况。面对这种需求,fire是如何应对的呢?fire框架约定,配置的key后缀区分不同的RocketMQ配置项,详见以下配置列表: ```properties # 以下配置中指定了两个RocketMQ集群信息 spark.rocket.brokers.name = localhost:9876;localhost02:9876 spark.rocket.topics = topic_name spark.rocket.consumer.instance = FireFramework spark.rocket.group.id = groupId # 注意key的数字后缀,对应代码中的keyNum=2 spark.rocket.brokers.name2 = localhost:9876;localhost02:9876 spark.rocket.topics2 = topic_name2 spark.rocket.consumer.instance2 = FireFramework spark.rocket.group.id2 = groupId2 ``` 那么,代码中是如何关联带有数字后缀的key的呢?答案是通过keyNum参数来指定: ```scala // 对应spark.rocket.brokers.name这个RocketMQ集群 val dstream = this.fire.createRocketMqPullStream(keyNum=1) // 对应spark.rocket.brokers.name2这个RocketMQ集群 val dstream2 = this.fire.createRocketMqPullStream(keyNum=2) ``` ### 四、RocketMQ-client参数调优 有时,需要对RocketMQ消费进行client端的调优,fire支持所有的RocketMQ-client参数,这些参数只需要添加到配置文件中即可生效: ```properties # 以spark.rocket.conf开头的配置支持所有rocket client的配置 spark.rocket.conf.pull.max.speed.per.partition = 5000 ``` ### 五、offset提交 #### 5.1 主动提交 ```scala dstream.rocketCommitOffsets() ``` #### 5.2 自动提交   Spark streaming在处理数据过程中,由于offset提交与数据处理可能不再一个算子中,就会出现stage失败,数据丢失,但offset却提交了。为了解决这个问题,fire框架提供了***foreachRDDAtLeastOnce***算子,来保证计算的数据不丢,失败重试(默认3次),成功自动提交等特性。 ```scala @Streaming(20) // spark streaming的批次时间 @RocketMQ(brokers = "bigdata_test", topics = "fire", groupId = "fire", tag = "*") object AtLeastOnceTest extends BaseSparkStreaming { override def process: Unit = { val dstream = this.fire.createRocketMqPullStream() // 至少一次的语义保证,处理成功自动提交offset,处理失败会重试指定次数,如果仍失败则任务退出 dstream.foreachRDDAtLeastOnce(rdd => { val studentRDD = rdd.map(t => JSONUtils.parseObject[Student](t.value())).repartition(2) val insertSql = s"INSERT INTO spark_test(name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" println("kafka.brokers.name=>" + this.conf.getString("kafka.brokers.name")) studentRDD.toDF().jdbcBatchUpdate(insertSql, Seq("name", "age", "createTime", "length", "sex"), batch = 1) }) this.fire.start } } ``` ### 五、代码示例 [1. spark示例代码](../fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/streaming/RocketTest.scala) [2. flink streaming示例代码](../fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/rocketmq/RocketTest.scala) [3. 
flink sql connector示例代码](../fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/rocketmq/RocketMQConnectorTest.scala) ### 六、@RocketMQ ```java /** * rocketmq集群连接信息 */ String brokers(); /** * rocketmq topics,多个使用逗号分隔 */ String topics(); /** * 消费者标识 */ String groupId(); /** * 指定消费的tag */ String tag() default "*"; /** * 指定从何处开始消费 */ String startingOffset() default ""; /** * 是否开启主动提交offset */ boolean autoCommit() default false; /** * RocketMQ-client参数,以key=value形式注明 */ String[] config() default ""; ``` ### 七、配置参数 | 参数名称 | 引擎 | 含义 | | ----------------------------------- | ----- | ------------------------------------------------------------ | | fire.rocket.cluster.map. | 通用 | 用于配置rocketmq集群别名 | | rocket.conf. | 通用 | 通过约定固定的前缀,支持rocketmq-client的所有参数 | | rocket.brokers.name | 通用 | nameserver 地址或别名 | | rocket.topics | 通用 | 主题名称 | | rocket.group.id | 通用 | 消费者id | | rocket.failOnDataLoss | 通用 | 丢失数据是否失败 | | rocket.forceSpecial | 通用 | 如果 forceSpecial 为true,rocketmq 无论如何都会从特定的可用偏移量开始消费 | | rocket.enable.auto.commit | 通用 | 是否自动提交offset | | rocket.starting.offsets | 通用 | RocketMQ起始消费位点 | | rocket.consumer.tag | 通用 | rocketMq订阅的tag | | rocket.pull.max.speed.per.partition | spark | 每次拉取每个partition的消息数 | | rocket.consumer.instance | spark | 用于区分不同的消费者实例 | | ocket.sink.parallelism | flink | sink的并行度 | ================================================ FILE: docs/datasource.md ================================================ # Spark DataSource增强   Spark DataSource API很强大,为了进一步增强灵活性,Fire框架针对DataSource API做了进一步封装,允许将options等信息放到配置文件中,提高灵活性,如果与实时平台的配置中心集成,可做到重启即完成调优。 [示例程序:](../fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/datasource/DataSourceTest.scala) ```scala val ds = this.fire.createDataFrame(Student.newStudentList(), classOf[Student]) ds.createOrReplaceTempView("test") val dataFrame = this.fire.sql("select * from test") // 一、 dataFrame.write.format.mode.save中的所有参数均可通过配置文件指定 // dataFrame.writeEnhance() // 二、 dataFrame.write.mode.save中部分参数通过配置文件指定,或全部通过方法硬编码指定 val savePath = "/user/hive/warehouse/hudi.db/hudi_bill_event_test" // 如果代码中与配置文件中均指定了option,则相同的options配置文件优先级更高,不同的option均生效 val options = Map( "hoodie.datasource.write.recordkey.field" -> "id", "hoodie.datasource.write.precombine.field" -> "id" ) // 使用keyNum标识读取配置文件中不同配置后缀的options信息 // dataFrame.writeEnhance("org.apache.hudi", SaveMode.Append, savePath, options = options, keyNum = 2) // read.format.mode.load(path) this.fire.readEnhance(keyNum = 3) ``` 配置文件: ```properties # 一、hudi datasource,全部基于配置文件进行配置 spark.datasource.format=org.apache.hudi spark.datasource.saveMode=Append # 用于区分调用save(path)还是saveAsTable spark.datasource.isSaveTable=false # 传入到底层save或saveAsTable方法中 spark.datasource.saveParam=/user/hive/warehouse/hudi.db/hudi_bill_event_test # 以spark.datasource.options.为前缀的配置用于配置hudi相关的参数,可覆盖代码中同名的配置 spark.datasource.options.hoodie.datasource.write.recordkey.field=id spark.datasource.options.hoodie.datasource.write.precombine.field=id spark.datasource.options.hoodie.datasource.write.partitionpath.field=ds spark.datasource.options.hoodie.table.name=hudi.hudi_bill_event_test spark.datasource.options.hoodie.datasource.write.hive_style_partitioning=true spark.datasource.options.hoodie.datasource.write.table.type=MERGE_ON_READ spark.datasource.options.hoodie.insert.shuffle.parallelism=128 spark.datasource.options.hoodie.upsert.shuffle.parallelism=128 spark.datasource.options.hoodie.fail.on.timeline.archiving=false spark.datasource.options.hoodie.clustering.inline=true 
spark.datasource.options.hoodie.clustering.inline.max.commits=8 spark.datasource.options.hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 spark.datasource.options.hoodie.clustering.plan.strategy.small.file.limit=629145600 spark.datasource.options.hoodie.clustering.plan.strategy.daybased.lookback.partitions=2 # 二、配置第二个数据源,以数字后缀作为区分,部分使用配置文件进行配置 spark.datasource.format2=org.apache.hudi2 spark.datasource.saveMode2=Overwrite # 用于区分调用save(path)还是saveAsTable spark.datasource.isSaveTable2=false # 传入到底层save或saveAsTable方法中 spark.datasource.saveParam2=/user/hive/warehouse/hudi.db/hudi_bill_event_test2 # 三、配置第三个数据源,用于代码中进行read操作 spark.datasource.format3=org.apache.hudi3 spark.datasource.loadParam3=/user/hive/warehouse/hudi.db/hudi_bill_event_test3 spark.datasource.options.hoodie.datasource.write.recordkey.field3=id3 ``` ================================================ FILE: docs/dev/config.md ================================================ # 参数配置   实时计算任务调优参数多种多样,繁杂众多,使用fire框架可以很方便的进行各种参数的设置。在方便开发者开发和调优的同时,业务平台集成提供了配置接口,实现平台化的配置管理。 ### 1. 用户参数配置   Fire框架支持基于接口、apollo、配置文件以及注解等多种方式配置,支持将spark&flink等**引擎参数**、**[fire框架参数](properties.md)**以及**用户自定义参数**混合配置,支持运行时动态修改配置。几种常用配置方式如下([*fire内置参数*](properties.md)): - **基于配置文件:** 创建类名同名的properties文件进行参数配置 - **基于接口配置:** fire框架提供了配置接口调用,通过接口获取所需的配置,可用于平台化的配置管理 - **基于注解配置:** 通过注解的方式实现集群环境、connector、调优参数的配置 #### 1.1 基于注解 ```scala // 通用的配置注解,支持任意的参数,还可以替代connector(如@Hive、@Kafka)类型参数,支持注释和多行配置 @Config( """ |# 支持Flink调优参数、Fire框架参数、用户自定义参数等 |state.checkpoints.num-retained=30 |state.checkpoints.dir=hdfs:///user/flink/checkpoint |my.conf=hello |""") // 配置连接到指定的hive,支持别名:@Hive("test"),别名需在cluster.properties中指定 @Hive("thrift://localhost:9083") // 100s做一次checkpoint,开启非对齐checkpoint,还支持checkpoint其他设置,如超时时间,两次checkpoint间隔时间等 @Checkpoint(interval = 100, unaligned = true) // 配置kafka connector,多个kafka消费通过不同数值后缀区分:@Kafka2、@Kafka3、@Kafka5等,支持url或别名 @Kafka(brokers = "localhost:9092", topics = "fire", groupId = "fire") // 配置rocketmq connector,同样支持消费多个rocketmq,支持url或别名 @RocketMQ(brokers = "bigdata_test", topics = "fire", groupId = "fire", tag = "*", startingOffset = "latest") // jdbc注解,可自动推断driverClass,支持配置多个jdbc数据源,支持url或别名 @Jdbc(url = "jdbc:mysql://mysql-server:3306/fire", username = "root", password = "..root726") // 配置Hbase数据源,支持配置多HBase集群读写,支持url或别名 @HBase("localhost:2181") ``` #### 1.2 基于配置文件   Fire框架约定,在任务启动时自动加载与该任务同名的,位于resources目录下以.properties结尾的配置文件(支持目录)。配置文件中如果定义了与@Config注解或者其他配置注解相同的配置时,配置文件中的优先级更高。[*fire框架参数*](properties.md)   另外,如果同一个项目中有多个任务共用一些配置信息,比如jdbc url、hbase集群地址等,可以将这些公共的配置放到resources目录下名为**common.properties**配置文件中。这样每个任务在启动前会先加载这个配置文件,实现配置复用。common.properties中的配置优先级低于任务级别的配置。 #### 1.3 基于平台   上述两种,无论是基于注解还是基于配置文件,修改参数时,都需要修改代码然后重新编译发布执行。为了节约开发时间,fire框架提供了参数设置接口,实时平台可通过接口调用的方式将web页面中任务级别的配置设置到不同的任务中,以此来实现在web页面中进行实时任务的调优。接口调用的参数优先级要高于配置文件和注解方式。 #### 1.4 配置热更新   集成了fire框架的spark或flink任务,支持在运行时动态修改用户配置。比如想修改一个运行中任务的jdbc batch的大小,可以通过fire框架提供的配置热更新接口来实现。当接口调用后,fire框架会将最新的配置信息分布式的同步给spark的每一个executor以及flink的每一个taskmanager。 ### 2. 配置获取   Fire框架封装了统一的配置获取api,基于该api,无论是spark还是flink,无论是在Driver | JobManager端还是在Executor | TaskManager端,都可以直接一行代码获取所需配置。这套配置获取api,无需再在flink的map等算子中复写open方法了,用起来十分方便。 ```scala this.conf.getString("my.conf") this.conf.getInt("state.checkpoints.num-retained") ... ``` ### 3. 实时平台配置   Fire框架是实时计算任务与实时平台之间沟通的桥梁,在设计之初,就充分考虑了与实时平台的集成。对于一些集群连接等敏感配置等配置,可通过配置中心来实现统一的约束。比如当迁移hive thrift地址时,可以在配置中心修改该地址,然后将配置的优先级调高为紧急,再通知对应实时任务重启任务即可实现hive thrift地址的统一修改。定义为紧急的配置,优先级是最高的,这样变实现了实时平台配置的统一兜底管理。 ### 4. 
配置别名

  配置使用url是不方便记忆的,也不便于统一管理和维护。假如某个数据源的url地址需要改动,那很多任务都要受牵连。为了解决这个问题,fire框架支持将数据源的url定义别名,效果如下所示:

```scala
// 直接使用url
@Hive("thrift://localhost:9083")
@HBase("localhost:2181")

// 使用别名
@Hive("batch")
@HBase("test")
```

建议将别名统一定义到 *[cluster.properties](..//fire-core/src/main/resources/cluster.properties)* 配置文件中,以下分别举几个例子说明如何定义数据源别名:

```properties
# 定义hbase集群连接信息别名为test,代码中hbase配置简化为:@HBase("test")
fire.hbase.cluster.map.test=zk01:2181,zk02:2181,zk03:2181
# kafka集群名称与集群地址映射mq,代码中kafka配置简化为:@Kafka(brokers = "mq", topics = "fire", groupId = "fire")
fire.kafka.cluster.map.mq=kafka01:9092,kafka02:9092,kafka03:9092
# hive metastore地址定义别名为batch,则代码中配置简化为:@Hive("batch")
fire.hive.cluster.map.batch=thrift://thrift01:9083,thrift://thrift02:9083
```

### 5. 配置优先级

Fire框架提供了很多种参数配置的方式,总结下来相同key的配置优先级如下:

***fire.properties < cluster.properties < 配置中心通用配置 < spark.properties|flink.properties < spark-core.properties|spark-streaming.properties|structured-streaming.properties|flink-streaming.properties|flink-batch.properties < common.properties < 注解配置方式 < 用户配置文件 < 配置中心紧急配置***

### 6. fire内置配置文件

Fire框架内置了多个配置文件,用于应对多种引擎场景,分别是:

- **fire.properties**:该配置文件是fire框架的总配置文件,位于fire-core包中,其中的配置主要是针对fire框架的,不含有spark或flink引擎的配置
- **cluster.properties**:该配置文件用于存放各公司集群地址相关的映射信息,由于集群地址信息比较敏感,因此单独拿出来作为一个配置文件
- **spark.properties**:该配置文件是spark引擎的总配置文件,位于fire-spark包中,作为spark引擎任务的总配置文件
- **spark-core.properties**:该配置文件位于fire-spark包中,用于配置spark core任务
- **spark-streaming.properties**:该配置文件位于fire-spark包中,主要用于spark streaming任务
- **structured-streaming.properties**:该配置文件位于fire-spark包中,用于进行structured streaming任务的配置
- **flink.properties**:该配置文件位于fire-flink包中,作为flink引擎的总配置文件
- **flink-streaming.properties**:该配置文件位于fire-flink包中,用于配置flink streaming任务
- **flink-batch.properties**:该配置文件位于fire-flink包中,用于配置flink批处理任务

================================================
FILE: docs/dev/deploy-script.md
================================================

## 任务提交脚本

### 一、Flink on yarn

***说明:**强烈建议使用flink的run application模式提交任务,在run application模式下可以使用fire更多的功能,包括restful接口、配置管理等,便于与实时平台集成,per-job模式不推荐使用。*

```shell
#!/bin/bash
# author: wangchenglong
# date: 2022-06-30 13:10:13
# desc:提交flink任务通用脚本
# usage:./deploy.sh com.zto.fire.examples.flink.Test

export FLINK_HOME=/opt/flink-1.14.3
export PATH=$FLINK_HOME/bin:$PATH

# 以run-application模式提交flink任务到yarn上,使flink任务与实时平台具有交互能力
# 可通过-D方式指定flink引擎参数、fire框架参数或用户自定义参数,代码中通过this.conf.get获取参数值
# -s用于指定从哪个checkpoint路径恢复
# 注意:注释不能写在行尾续行符"\"之后,否则命令会被截断
flink run-application -t yarn-application \
-D taskmanager.memory.process.size=4g \
-D state.checkpoints.dir=hdfs:///user/flink/checkpoint/fire \
-D flink.stream.checkpoint.interval=6000 \
-D fire.shutdown.auto.exit=true \
--allowNonRestoredState \
-s hdfs:/user/flink/checkpoint/xxx/chk-5/_metadata \
-ynm fire_test -yqu root.default -ys 1 -ytm 2g -c $1 zto-flink*.jar $*
```

### 二、Spark on yarn

```shell
#!/bin/bash
# author: wangchenglong
# date: 2022-06-30 13:24:13
# desc:提交spark任务通用脚本
# usage:./deploy.sh com.zto.fire.examples.spark.Test

export SPARK_HOME=/opt/spark3.0.2
export PATH=$SPARK_HOME/bin:$PATH

# 以cluster模式提交spark任务到yarn上
# 可通过--conf方式指定spark引擎参数、fire框架参数或用户自定义参数,代码中通过this.conf.get获取参数值
spark-submit \
--master yarn --deploy-mode cluster --class $1 --num-executors 20 --executor-cores 1 \
--driver-memory 1g --executor-memory 1g \
--conf fire.shutdown.auto.exit=true \
./zto-spark*.jar $*
```

================================================
FILE: docs/dev/engine-env.md
================================================

# 依赖管理
Fire框架中很多依赖的scope指定为provided,好处是避免jar包冲突、避免jar包过于臃肿。带来的问题是会引擎依赖找不到(class not found)异常。解决这个问题的方案有两个,一个是将fire中使用到的生命周期为provided的依赖改成compile(任务的pom.xml中指定),一个是将相应依赖的jar包放到spark或flink的lib目录下。本文档选择的是第二个方案,将缺失的依赖放到引擎部署目录的lib下。缺失的jar包可以从[网盘下载](https://pan.baidu.com/s/16kUGQIj2gQjWZdbmuxuyXw?pwd=fire)、[*maven中央仓库*](http://mvnrepository.com/)或本地仓库搜索找到。依赖列表如下: ## 一、Flink on yarn环境 ***说明:**下面的jar包列表以flink1.14.3为例* ```shell [fire@node01 lib]$ pwd /home/fire/opt/flink-1.14.3/lib [fire@node01 lib]$ ll 总用量 307580 -rwxr-xr-x 1 fire fire 1112191 2月 17 13:43 antlr-3.4.jar -rwxr-xr-x 1 fire fire 164368 2月 17 13:43 antlr-runtime-3.4.jar -rwxr-xr-x 1 fire fire 7685845 1月 26 13:47 flink-connector-hive_2.12-1.14.3.jar -rwxr-xr-x 1 fire fire 255974 4月 21 14:31 flink-connector-jdbc_2.12-1.14.3.jar -rwxr-xr-x 1 fire fire 250201 2月 16 18:27 flink-connector-jdbc_2.12-1.14.3.jar.bak -rwxr-xr-x 1 fire fire 389763 4月 18 15:30 flink-connector-kafka_2.12-1.14.3.jar -rwxr-xr-x 1 fire fire 85584 1月 11 07:42 flink-csv-1.14.3.jar -rwxr-xr-x 1 fire fire 136054094 1月 11 07:45 flink-dist_2.12-1.14.3.jar -rwxr-xr-x 1 fire fire 78159 2月 14 13:20 flink-hadoop-compatibility_2.12-1.14.2.jar -rwxr-xr-x 1 fire fire 153145 1月 11 07:42 flink-json-1.14.3.jar -rwxr-xr-x 1 fire fire 757120 2月 14 13:23 flink-scala_2.12-1.14.3.jar -rwxr-xr-x 1 fire fire 24051799 2月 16 17:42 flink-shaded-hadoop-2-2.6.5-9.0.jar -rwxr-xr-x 1 fire fire 7709731 8月 22 2021 flink-shaded-zookeeper-3.4.14.jar -rwxr-xr-x 1 fire fire 39633410 1月 11 07:45 flink-table_2.12-1.14.3.jar -rwxr-xr-x 1 fire fire 119407 2月 16 18:34 flink-table-api-java-bridge_2.12-1.14.3.jar -rwxr-xr-x 1 fire fire 79018 2月 16 18:35 flink-table-api-scala_2.12-1.14.3.jar -rwxr-xr-x 1 fire fire 50197 2月 16 18:35 flink-table-api-scala-bridge_2.12-1.14.3.jar -rwxr-xr-x 1 fire fire 928547 2月 16 18:34 flink-table-common-1.14.3.jar -rwxr-xr-x 1 fire fire 35774469 2月 16 18:34 flink-table-planner_2.12-1.14.3.jar -rwxr-xr-x 1 fire fire 241622 2月 17 09:29 gson-2.8.5.jar -rwxr-xr-x 1 fire fire 2256213 2月 16 18:42 guava-18.0.jar -rwxr-xr-x 1 fire fire 9808 2月 17 09:24 hadoop-client-2.6.0-cdh5.12.1.jar -rwxr-xr-x 1 fire fire 3539744 2月 17 09:24 hadoop-common-2.6.0-cdh5.12.1.jar -rwxr-xr-x 1 fire fire 1765829 2月 17 09:24 hadoop-core-2.6.0-mr1-cdh5.12.1.jar -rwxr-xr-x 1 fire fire 11550546 2月 17 09:24 hadoop-hdfs-2.6.0-cdh5.12.1.jar -rwxr-xr-x 1 fire fire 1321508 2月 16 21:18 hbase-client-1.2.0-cdh5.12.1.jar -rwxr-xr-x 1 fire fire 585568 2月 16 21:18 hbase-common-1.2.0-cdh5.12.1.jar -rwxr-xr-x 1 fire fire 4618035 2月 16 21:18 hbase-protocol-1.2.0-cdh5.12.1.jar -rwxr-xr-x 1 fire fire 292289 2月 18 17:39 hive-common-1.2.1.jar -rwxr-xr-x 1 fire fire 20599029 2月 18 17:39 hive-exec-1.2.1.jar -rwxr-xr-x 1 fire fire 5505100 2月 18 17:39 hive-metastore-1.2.1.jar -rwxr-xr-x 1 fire fire 95806 2月 16 18:46 javax.servlet-api-3.1.0.jar -rwxr-xr-x 1 fire fire 201124 2月 17 09:29 jdo-api-3.0.1.jar -rwxr-xr-x 1 fire fire 3269712 2月 16 18:33 kafka-clients-2.4.1.jar -rwxr-xr-x 1 fire fire 275186 2月 17 09:29 libfb303-0.9.0.jar -rwxr-xr-x 1 fire fire 208006 1月 9 04:13 log4j-1.2-api-2.17.1.jar -rwxr-xr-x 1 fire fire 301872 1月 9 04:13 log4j-api-2.17.1.jar -rwxr-xr-x 1 fire fire 1790452 1月 9 04:13 log4j-core-2.17.1.jar -rwxr-xr-x 1 fire fire 24279 1月 9 04:13 log4j-slf4j-impl-2.17.1.jar -rwxr-xr-x 1 fire fire 82123 2月 16 21:25 metrics-core-2.2.0.jar -rwxr-xr-x 1 fire fire 992805 4月 21 15:15 mysql-connector-java-5.1.41.jar ``` ## 二、Spark on yarn环境 ***说明:**下面的jar包列表以spark3.0.2为例*。[网盘下载](https://pan.baidu.com/s/16kUGQIj2gQjWZdbmuxuyXw?pwd=fire) 
```shell [fire@node01 jars]$ pwd /home/fire/opt/spark3.0.2/jars [fire@node01 jars]$ ll 总用量 216276 -rw-r--r-- 1 fire fire 69409 2月 16 2021 activation-1.1.1.jar -rw-r--r-- 1 fire fire 134044 2月 16 2021 aircompressor-0.10.jar -rw-r--r-- 1 fire fire 1168113 2月 16 2021 algebra_2.12-2.0.0-M2.jar -rw-r--r-- 1 fire fire 445288 2月 16 2021 antlr-2.7.7.jar -rw-r--r-- 1 fire fire 336803 2月 16 2021 antlr4-runtime-4.7.1.jar -rw-r--r-- 1 fire fire 164368 2月 16 2021 antlr-runtime-3.4.jar -rw-r--r-- 1 fire fire 4467 2月 16 2021 aopalliance-1.0.jar -rw-r--r-- 1 fire fire 27006 2月 16 2021 aopalliance-repackaged-2.6.1.jar -rw-r--r-- 1 fire fire 44925 2月 16 2021 apacheds-i18n-2.0.0-M15.jar -rw-r--r-- 1 fire fire 691479 2月 16 2021 apacheds-kerberos-codec-2.0.0-M15.jar -rw-r--r-- 1 fire fire 448794 2月 16 2021 apache-log4j-extras-1.2.17.jar -rw-r--r-- 1 fire fire 16560 2月 16 2021 api-asn1-api-1.0.0-M20.jar -rw-r--r-- 1 fire fire 79912 2月 16 2021 api-util-1.0.0-M20.jar -rw-r--r-- 1 fire fire 1194003 2月 16 2021 arpack_combined_all-0.1.jar -rw-r--r-- 1 fire fire 64674 2月 16 2021 arrow-format-0.15.1.jar -rw-r--r-- 1 fire fire 105777 2月 16 2021 arrow-memory-0.15.1.jar -rw-r--r-- 1 fire fire 1437215 2月 16 2021 arrow-vector-0.15.1.jar -rw-r--r-- 1 fire fire 20437 2月 16 2021 audience-annotations-0.5.0.jar -rw-r--r-- 1 fire fire 176285 2月 16 2021 automaton-1.11-8.jar -rw-r--r-- 1 fire fire 1556863 2月 16 2021 avro-1.8.2.jar -rw-r--r-- 1 fire fire 132989 2月 16 2021 avro-ipc-1.8.2.jar -rw-r--r-- 1 fire fire 187052 2月 16 2021 avro-mapred-1.8.2-hadoop2.jar -rw-r--r-- 1 fire fire 110600 2月 16 2021 bonecp-0.8.0.RELEASE.jar -rw-r--r-- 1 fire fire 13826799 2月 16 2021 breeze_2.12-1.0.jar -rw-r--r-- 1 fire fire 134696 2月 16 2021 breeze-macros_2.12-1.0.jar -rw-r--r-- 1 fire fire 3226851 2月 16 2021 cats-kernel_2.12-2.0.0-M4.jar -rw-r--r-- 1 fire fire 211523 2月 16 2021 chill_2.12-0.9.5.jar -rw-r--r-- 1 fire fire 58684 2月 16 2021 chill-java-0.9.5.jar -rw-r--r-- 1 fire fire 246918 2月 16 2021 commons-beanutils-1.9.4.jar -rw-r--r-- 1 fire fire 41123 2月 16 2021 commons-cli-1.2.jar -rw-r--r-- 1 fire fire 284184 2月 16 2021 commons-codec-1.10.jar -rw-r--r-- 1 fire fire 588337 2月 16 2021 commons-collections-3.2.2.jar -rw-r--r-- 1 fire fire 71626 2月 16 2021 commons-compiler-3.0.16.jar -rw-r--r-- 1 fire fire 632424 2月 16 2021 commons-compress-1.20.jar -rw-r--r-- 1 fire fire 298829 2月 16 2021 commons-configuration-1.6.jar -rw-r--r-- 1 fire fire 166244 2月 16 2021 commons-crypto-1.1.0.jar -rw-r--r-- 1 fire fire 160519 2月 16 2021 commons-dbcp-1.4.jar -rw-r--r-- 1 fire fire 143602 2月 16 2021 commons-digester-1.8.jar -rw-r--r-- 1 fire fire 305001 2月 16 2021 commons-httpclient-3.1.jar -rw-r--r-- 1 fire fire 185140 2月 16 2021 commons-io-2.4.jar -rw-r--r-- 1 fire fire 284220 2月 16 2021 commons-lang-2.6.jar -rw-r--r-- 1 fire fire 503880 2月 16 2021 commons-lang3-3.9.jar -rw-r--r-- 1 fire fire 62050 2月 16 2021 commons-logging-1.1.3.jar -rw-r--r-- 1 fire fire 2035066 2月 16 2021 commons-math3-3.4.1.jar -rw-r--r-- 1 fire fire 273370 2月 16 2021 commons-net-3.1.jar -rw-r--r-- 1 fire fire 96221 2月 16 2021 commons-pool-1.5.4.jar -rw-r--r-- 1 fire fire 197176 2月 16 2021 commons-text-1.6.jar -rw-r--r-- 1 fire fire 79845 2月 16 2021 compress-lzf-1.0.3.jar -rw-r--r-- 1 fire fire 164422 2月 16 2021 core-1.1.2.jar -rw-r--r-- 1 fire fire 69500 2月 16 2021 curator-client-2.7.1.jar -rw-r--r-- 1 fire fire 186273 2月 16 2021 curator-framework-2.7.1.jar -rw-r--r-- 1 fire fire 270342 2月 16 2021 curator-recipes-2.7.1.jar -rw-r--r-- 1 fire fire 339666 2月 16 2021 
datanucleus-api-jdo-3.2.6.jar -rw-r--r-- 1 fire fire 1890075 2月 16 2021 datanucleus-core-3.2.10.jar -rw-r--r-- 1 fire fire 1809447 2月 16 2021 datanucleus-rdbms-3.2.9.jar -rw-r--r-- 1 fire fire 3224708 2月 16 2021 derby-10.12.1.1.jar -rw-r--r-- 1 fire fire 18497 2月 16 2021 flatbuffers-java-1.9.0.jar -rw-r--r-- 1 fire fire 14395 2月 16 2021 generex-1.0.2.jar -rw-r--r-- 1 fire fire 190432 2月 16 2021 gson-2.2.4.jar -rw-r--r-- 1 fire fire 2189117 2月 16 2021 guava-14.0.1.jar -rw-r--r-- 1 fire fire 710492 2月 16 2021 guice-3.0.jar -rw-r--r-- 1 fire fire 65012 2月 16 2021 guice-servlet-3.0.jar -rw-r--r-- 1 fire fire 41094 2月 16 2021 hadoop-annotations-2.7.4.jar -rw-r--r-- 1 fire fire 94621 2月 16 2021 hadoop-auth-2.7.4.jar -rw-r--r-- 1 fire fire 26243 2月 16 2021 hadoop-client-2.7.4.jar -rw-r--r-- 1 fire fire 3499224 2月 16 2021 hadoop-common-2.7.4.jar -rw-r--r-- 1 fire fire 8350471 2月 16 2021 hadoop-hdfs-2.7.4.jar -rw-r--r-- 1 fire fire 543852 2月 16 2021 hadoop-mapreduce-client-app-2.7.4.jar -rw-r--r-- 1 fire fire 776862 2月 16 2021 hadoop-mapreduce-client-common-2.7.4.jar -rw-r--r-- 1 fire fire 1558288 2月 16 2021 hadoop-mapreduce-client-core-2.7.4.jar -rw-r--r-- 1 fire fire 62960 2月 16 2021 hadoop-mapreduce-client-jobclient-2.7.4.jar -rw-r--r-- 1 fire fire 72050 2月 16 2021 hadoop-mapreduce-client-shuffle-2.7.4.jar -rw-r--r-- 1 fire fire 2039372 2月 16 2021 hadoop-yarn-api-2.7.4.jar -rw-r--r-- 1 fire fire 166121 2月 16 2021 hadoop-yarn-client-2.7.4.jar -rw-r--r-- 1 fire fire 1679789 2月 16 2021 hadoop-yarn-common-2.7.4.jar -rw-r--r-- 1 fire fire 388572 2月 16 2021 hadoop-yarn-server-common-2.7.4.jar -rw-r--r-- 1 fire fire 58699 2月 16 2021 hadoop-yarn-server-web-proxy-2.7.4.jar -rw-r--r-- 1 fire fire 138464 2月 16 2021 hive-beeline-1.2.1.spark2.jar -rw-r--r-- 1 fire fire 40817 2月 16 2021 hive-cli-1.2.1.spark2.jar -rw-r--r-- 1 fire fire 11498852 2月 16 2021 hive-exec-1.2.1.spark2.jar -rw-r--r-- 1 fire fire 100680 2月 16 2021 hive-jdbc-1.2.1.spark2.jar -rw-r--r-- 1 fire fire 5505200 2月 16 2021 hive-metastore-1.2.1.spark2.jar -rw-r--r-- 1 fire fire 200223 2月 16 2021 hk2-api-2.6.1.jar -rw-r--r-- 1 fire fire 203358 2月 16 2021 hk2-locator-2.6.1.jar -rw-r--r-- 1 fire fire 131590 2月 16 2021 hk2-utils-2.6.1.jar -rw-r--r-- 1 fire fire 1475955 2月 16 2021 htrace-core-3.1.0-incubating.jar -rw-r--r-- 1 fire fire 767140 2月 16 2021 httpclient-4.5.6.jar -rw-r--r-- 1 fire fire 328347 2月 16 2021 httpcore-4.4.12.jar -rw-r--r-- 1 fire fire 27156 2月 16 2021 istack-commons-runtime-3.0.8.jar -rw-r--r-- 1 fire fire 1282424 2月 16 2021 ivy-2.4.0.jar -rw-r--r-- 1 fire fire 67889 2月 16 2021 jackson-annotations-2.10.0.jar -rw-r--r-- 1 fire fire 348635 2月 16 2021 jackson-core-2.10.0.jar -rw-r--r-- 1 fire fire 232248 2月 16 2021 jackson-core-asl-1.9.13.jar -rw-r--r-- 1 fire fire 1400944 2月 16 2021 jackson-databind-2.10.0.jar -rw-r--r-- 1 fire fire 46646 2月 16 2021 jackson-dataformat-yaml-2.10.0.jar -rw-r--r-- 1 fire fire 105898 2月 16 2021 jackson-datatype-jsr310-2.10.3.jar -rw-r--r-- 1 fire fire 18336 2月 16 2021 jackson-jaxrs-1.9.13.jar -rw-r--r-- 1 fire fire 780664 2月 16 2021 jackson-mapper-asl-1.9.13.jar -rw-r--r-- 1 fire fire 34991 2月 16 2021 jackson-module-jaxb-annotations-2.10.0.jar -rw-r--r-- 1 fire fire 43740 2月 16 2021 jackson-module-paranamer-2.10.0.jar -rw-r--r-- 1 fire fire 341862 2月 16 2021 jackson-module-scala_2.12-2.10.0.jar -rw-r--r-- 1 fire fire 27084 2月 16 2021 jackson-xc-1.9.13.jar -rw-r--r-- 1 fire fire 44399 2月 16 2021 jakarta.activation-api-1.2.1.jar -rw-r--r-- 1 fire fire 25058 2月 16 2021 jakarta.annotation-api-1.3.5.jar 
-rw-r--r-- 1 fire fire 18140 2月 16 2021 jakarta.inject-2.6.1.jar -rw-r--r-- 1 fire fire 91930 2月 16 2021 jakarta.validation-api-2.0.2.jar -rw-r--r-- 1 fire fire 140376 2月 16 2021 jakarta.ws.rs-api-2.1.6.jar -rw-r--r-- 1 fire fire 115498 2月 16 2021 jakarta.xml.bind-api-2.3.2.jar -rw-r--r-- 1 fire fire 926574 2月 16 2021 janino-3.0.16.jar -rw-r--r-- 1 fire fire 16993 2月 16 2021 JavaEWAH-0.3.2.jar -rw-r--r-- 1 fire fire 780265 2月 16 2021 javassist-3.25.0-GA.jar -rw-r--r-- 1 fire fire 2497 2月 16 2021 javax.inject-1.jar -rw-r--r-- 1 fire fire 95806 2月 16 2021 javax.servlet-api-3.1.0.jar -rw-r--r-- 1 fire fire 395195 2月 16 2021 javolution-5.5.1.jar -rw-r--r-- 1 fire fire 105134 2月 16 2021 jaxb-api-2.2.2.jar -rw-r--r-- 1 fire fire 1013367 2月 16 2021 jaxb-runtime-2.3.2.jar -rw-r--r-- 1 fire fire 16537 2月 16 2021 jcl-over-slf4j-1.7.30.jar -rw-r--r-- 1 fire fire 201124 2月 16 2021 jdo-api-3.0.1.jar -rw-r--r-- 1 fire fire 244502 2月 16 2021 jersey-client-2.30.jar -rw-r--r-- 1 fire fire 1166647 2月 16 2021 jersey-common-2.30.jar -rw-r--r-- 1 fire fire 32091 2月 16 2021 jersey-container-servlet-2.30.jar -rw-r--r-- 1 fire fire 73349 2月 16 2021 jersey-container-servlet-core-2.30.jar -rw-r--r-- 1 fire fire 76733 2月 16 2021 jersey-hk2-2.30.jar -rw-r--r-- 1 fire fire 85815 2月 16 2021 jersey-media-jaxb-2.30.jar -rw-r--r-- 1 fire fire 927721 2月 16 2021 jersey-server-2.30.jar -rw-r--r-- 1 fire fire 539912 2月 16 2021 jetty-6.1.26.jar -rw-r--r-- 1 fire fire 18891 2月 16 2021 jetty-sslengine-6.1.26.jar -rw-r--r-- 1 fire fire 177131 2月 16 2021 jetty-util-6.1.26.jar -rw-r--r-- 1 fire fire 232470 2月 16 2021 JLargeArrays-1.5.jar -rw-r--r-- 1 fire fire 268780 2月 16 2021 jline-2.14.6.jar -rw-r--r-- 1 fire fire 643043 2月 16 2021 joda-time-2.10.5.jar -rw-r--r-- 1 fire fire 427780 2月 16 2021 jodd-core-3.5.2.jar -rw-r--r-- 1 fire fire 12131 2月 16 2021 jpam-1.1.jar -rw-r--r-- 1 fire fire 83632 2月 16 2021 json4s-ast_2.12-3.6.6.jar -rw-r--r-- 1 fire fire 482486 2月 16 2021 json4s-core_2.12-3.6.6.jar -rw-r--r-- 1 fire fire 36175 2月 16 2021 json4s-jackson_2.12-3.6.6.jar -rw-r--r-- 1 fire fire 349025 2月 16 2021 json4s-scalap_2.12-3.6.6.jar -rw-r--r-- 1 fire fire 100636 2月 16 2021 jsp-api-2.1.jar -rw-r--r-- 1 fire fire 33031 2月 16 2021 jsr305-3.0.0.jar -rw-r--r-- 1 fire fire 15071 2月 16 2021 jta-1.1.jar -rw-r--r-- 1 fire fire 1175798 2月 16 2021 JTransforms-3.1.jar -rw-r--r-- 1 fire fire 4592 2月 16 2021 jul-to-slf4j-1.7.30.jar -rw-r--r-- 1 fire fire 410874 2月 16 2021 kryo-shaded-4.0.2.jar -rw-r--r-- 1 fire fire 775174 2月 16 2021 kubernetes-client-4.9.2.jar -rw-r--r-- 1 fire fire 11908731 2月 16 2021 kubernetes-model-4.9.2.jar -rw-r--r-- 1 fire fire 3954 2月 16 2021 kubernetes-model-common-4.9.2.jar -rw-r--r-- 1 fire fire 1045744 2月 16 2021 leveldbjni-all-1.8.jar -rw-r--r-- 1 fire fire 313702 2月 16 2021 libfb303-0.9.3.jar -rw-r--r-- 1 fire fire 246445 2月 16 2021 libthrift-0.12.0.jar -rw-r--r-- 1 fire fire 489884 2月 16 2021 log4j-1.2.17.jar -rw-r--r-- 1 fire fire 12488 2月 16 2021 logging-interceptor-3.12.6.jar -rw-r--r-- 1 fire fire 649950 2月 16 2021 lz4-java-1.7.1.jar -rw-r--r-- 1 fire fire 33786 2月 16 2021 machinist_2.12-0.6.8.jar -rw-r--r-- 1 fire fire 3180 2月 16 2021 macro-compat_2.12-1.1.1.jar -rw-r--r-- 1 fire fire 7343426 2月 16 2021 mesos-1.4.0-shaded-protobuf.jar -rw-r--r-- 1 fire fire 105365 2月 16 2021 metrics-core-4.1.1.jar -rw-r--r-- 1 fire fire 22042 2月 16 2021 metrics-graphite-4.1.1.jar -rw-r--r-- 1 fire fire 20889 2月 16 2021 metrics-jmx-4.1.1.jar -rw-r--r-- 1 fire fire 16642 2月 16 2021 metrics-json-4.1.1.jar -rw-r--r-- 1 fire 
fire 23909 2月 16 2021 metrics-jvm-4.1.1.jar -rw-r--r-- 1 fire fire 5711 2月 16 2021 minlog-1.3.0.jar -rw-r--r-- 1 fire fire 4153218 2月 16 2021 netty-all-4.1.47.Final.jar -rw-r--r-- 1 fire fire 54391 2月 16 2021 objenesis-2.5.1.jar -rw-r--r-- 1 fire fire 423175 2月 16 2021 okhttp-3.12.6.jar -rw-r--r-- 1 fire fire 88732 2月 16 2021 okio-1.15.0.jar -rw-r--r-- 1 fire fire 19827 2月 16 2021 opencsv-2.3.jar -rw-r--r-- 1 fire fire 1580620 2月 16 2021 orc-core-1.5.10-nohive.jar -rw-r--r-- 1 fire fire 814061 2月 16 2021 orc-mapreduce-1.5.10-nohive.jar -rw-r--r-- 1 fire fire 27749 2月 16 2021 orc-shims-1.5.10.jar -rw-r--r-- 1 fire fire 65261 2月 16 2021 oro-2.0.8.jar -rw-r--r-- 1 fire fire 19479 2月 16 2021 osgi-resource-locator-1.0.3.jar -rw-r--r-- 1 fire fire 34654 2月 16 2021 paranamer-2.8.jar -rw-r--r-- 1 fire fire 1097799 2月 16 2021 parquet-column-1.10.1.jar -rw-r--r-- 1 fire fire 94995 2月 16 2021 parquet-common-1.10.1.jar -rw-r--r-- 1 fire fire 848750 2月 16 2021 parquet-encoding-1.10.1.jar -rw-r--r-- 1 fire fire 723203 2月 16 2021 parquet-format-2.4.0.jar -rw-r--r-- 1 fire fire 285732 2月 16 2021 parquet-hadoop-1.10.1.jar -rw-r--r-- 1 fire fire 2796935 2月 16 2021 parquet-hadoop-bundle-1.6.0.jar -rw-r--r-- 1 fire fire 1048171 2月 16 2021 parquet-jackson-1.10.1.jar -rw-r--r-- 1 fire fire 533455 2月 16 2021 protobuf-java-2.5.0.jar -rw-r--r-- 1 fire fire 123052 2月 16 2021 py4j-0.10.9.jar -rw-r--r-- 1 fire fire 100431 2月 16 2021 pyrolite-4.30.jar -rw-r--r-- 1 fire fire 325335 2月 16 2021 RoaringBitmap-0.7.45.jar -rw-r--r-- 1 fire fire 112235 2月 16 2021 scala-collection-compat_2.12-2.1.1.jar -rw-r--r-- 1 fire fire 10672015 2月 16 2021 scala-compiler-2.12.10.jar -rw-r--r-- 1 fire fire 5276900 2月 16 2021 scala-library-2.12.10.jar -rw-r--r-- 1 fire fire 222980 2月 16 2021 scala-parser-combinators_2.12-1.1.2.jar -rw-r--r-- 1 fire fire 3678534 2月 16 2021 scala-reflect-2.12.10.jar -rw-r--r-- 1 fire fire 556575 2月 16 2021 scala-xml_2.12-1.2.0.jar -rw-r--r-- 1 fire fire 3243337 2月 16 2021 shapeless_2.12-2.3.3.jar -rw-r--r-- 1 fire fire 4028 2月 16 2021 shims-0.7.45.jar -rw-r--r-- 1 fire fire 41472 2月 16 2021 slf4j-api-1.7.30.jar -rw-r--r-- 1 fire fire 12211 2月 16 2021 slf4j-log4j12-1.7.30.jar -rw-r--r-- 1 fire fire 302558 2月 16 2021 snakeyaml-1.24.jar -rw-r--r-- 1 fire fire 48720 2月 16 2021 snappy-0.2.jar -rw-r--r-- 1 fire fire 1969177 2月 16 2021 snappy-java-1.1.8.2.jar -rw-r--r-- 1 fire fire 9409634 2月 16 2021 spark-catalyst_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 9880087 2月 16 2021 spark-core_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 430762 2月 16 2021 spark-graphx_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 693694 2月 16 2021 spark-hive_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 1886671 2月 16 2021 spark-hive-thriftserver_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 374948 2月 16 2021 spark-kubernetes_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 59868 2月 16 2021 spark-kvstore_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 75937 2月 16 2021 spark-launcher_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 295158 2月 16 2021 spark-mesos_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 5887713 2月 16 2021 spark-mllib_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 111921 2月 16 2021 spark-mllib-local_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 2397705 2月 16 2021 spark-network-common_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 86942 2月 16 2021 spark-network-shuffle_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 52496 2月 16 2021 spark-repl_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 30353 2月 16 2021 spark-sketch_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 7160215 2月 16 2021 spark-sql_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 1138146 2月 16 2021 
spark-streaming_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 15155 2月 16 2021 spark-tags_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 10375 2月 16 2021 spark-tags_2.12-3.0.2-tests.jar -rw-r--r-- 1 fire fire 51308 2月 16 2021 spark-unsafe_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 331837 7月 28 2021 spark-yarn_2.12-3.0.2.jar -rw-r--r-- 1 fire fire 331935 2月 16 2021 spark-yarn_2.12-3.0.2.jar.bak -rw-r--r-- 1 fire fire 7188024 2月 16 2021 spire_2.12-0.17.0-M1.jar -rw-r--r-- 1 fire fire 79588 2月 16 2021 spire-macros_2.12-0.17.0-M1.jar -rw-r--r-- 1 fire fire 8261 2月 16 2021 spire-platform_2.12-0.17.0-M1.jar -rw-r--r-- 1 fire fire 34601 2月 16 2021 spire-util_2.12-0.17.0-M1.jar -rw-r--r-- 1 fire fire 236660 2月 16 2021 ST4-4.0.4.jar -rw-r--r-- 1 fire fire 26514 2月 16 2021 stax-api-1.0.1.jar -rw-r--r-- 1 fire fire 23346 2月 16 2021 stax-api-1.0-2.jar -rw-r--r-- 1 fire fire 178149 2月 16 2021 stream-2.9.6.jar -rw-r--r-- 1 fire fire 148627 2月 16 2021 stringtemplate-3.2.1.jar -rw-r--r-- 1 fire fire 93210 2月 16 2021 super-csv-2.2.0.jar -rw-r--r-- 1 fire fire 233745 2月 16 2021 threeten-extra-1.5.0.jar -rw-r--r-- 1 fire fire 443986 2月 16 2021 univocity-parsers-2.9.0.jar -rw-r--r-- 1 fire fire 281356 2月 16 2021 xbean-asm7-shaded-4.15.jar -rw-r--r-- 1 fire fire 1386397 2月 16 2021 xercesImpl-2.12.0.jar -rw-r--r-- 1 fire fire 220536 2月 16 2021 xml-apis-1.4.01.jar -rw-r--r-- 1 fire fire 15010 2月 16 2021 xmlenc-0.52.jar -rw-r--r-- 1 fire fire 99555 2月 16 2021 xz-1.5.jar -rw-r--r-- 1 fire fire 35518 2月 16 2021 zjsonpatch-0.3.0.jar -rw-r--r-- 1 fire fire 911603 2月 16 2021 zookeeper-3.4.14.jar -rw-r--r-- 1 fire fire 4210625 2月 16 2021 zstd-jni-1.4.4-3.jar ``` ================================================ FILE: docs/dev/integration.md ================================================ ### 一、编译与安装 ```shell # git clone https://github.com/ZTO-Express/fire.git # mvn clean install -DskipTests -Pspark-3.0.2 -Pflink-1.14.3 -Pscala-2.12 ``` 建议将fire deploy到maven私服,便于每个人去使用。编译fire框架时,可根据实际需求编译成指定版本的spark或flink。官方适配的版本如下: | Apache Spark | Apache Flink | | ------------ | ------------ | | 2.3.x | 1.10.x | | 2.4.x | 1.11.x | | 3.0.x | 1.12.x | | 3.1.x | 1.13.x | | 3.2.x | 1.14.x | | 3.3.x | 1.15.x | ### 二、maven依赖 - [spark项目pom样例](pom/spark-pom.xml) - [flink项目pom样例](pom/flink-pom.xml) ### 三、开发步骤 Fire框架提供了统一的编码风格,基于这种编码风格,可以很轻松的进行spark或flink代码开发。 ```scala package com.zto.fire.examples.flink import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.core.anno._ import com.zto.fire.flink.BaseFlinkStreaming import com.zto.fire.flink.anno.Checkpoint /** * 基于Fire进行Flink Streaming开发 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Config( """ |# 支持Flink调优参数、Fire框架参数、用户自定义参数等 |state.checkpoints.num-retained=30 |state.checkpoints.dir=hdfs:///user/flink/checkpoint |""") @Hive("thrift://localhost:9083") // 配置连接到指定的hive @Checkpoint(interval = 100, unaligned = true) // 100s做一次checkpoint,开启非对齐checkpoint @Kafka(brokers = "localhost:9092", topics = "fire", groupId = "fire") object FlinkDemo extends BaseFlinkStreaming { /** process方法中编写业务逻辑代码,该方法会被fire框架自动调起 **/ override def process: Unit = { val dstream = this.fire.createKafkaDirectStream() // 使用api的方式消费kafka this.fire.sql("""create table statement ...""") this.fire.sql("""insert into statement ...""") this.fire.start } } ``` 从以上代码片段中可以看到,引入fire框架大体分为5个步骤: #### 3.1 隐式转换 无论是spark还是flink任务,都需要引入以下的隐式转换,该隐式转换提供了众多简单易用的api。 ```scala import com.zto.fire._ ``` #### 3.2 继承父类 Fire框架针对不同的引擎、不同的场景提供了对应的父类,用户需要根据实际情况去继承(spark父类的用法可参考本节末尾的示例): ##### 3.2.1 spark引擎父类列表: - **BaseSparkStreaming**:适用于进行Spark Streaming任务的开发 - **BaseSparkCore**:适用于进行Spark批处理任务的开发 - **BaseStructuredStreaming**:适用于进行Spark Structured Streaming任务的开发 ##### 3.2.2 flink引擎父类列表: - **BaseFlinkStreaming**:适用于进行flink流式计算任务的开发 - **BaseFlinkBatch**:适用于进行flink批处理任务的开发
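
下面给出一个Spark Streaming父类用法的最小示意代码,风格与上文的FlinkDemo保持一致,写法参考自本文档docs/highlight/spark-duration.md中的SparkDemo示例;其中package与import的包路径、注解参数均为示例假设,请以实际工程与fire-spark模块为准:

```scala
package com.zto.fire.examples.spark

import com.zto.fire._
import com.zto.fire.core.anno._
// 注:以下两个包路径为示例假设,请以实际的fire-spark模块为准
import com.zto.fire.spark.BaseSparkStreaming
import com.zto.fire.spark.anno.Streaming

/**
 * 基于Fire进行Spark Streaming开发的最小示意(与上方FlinkDemo风格一致)
 */
@Streaming(interval = 100, maxRatePerPartition = 100) // 100s一个Streaming batch,并限制消费速率
@Kafka(brokers = "localhost:9092", topics = "fire", groupId = "fire")
object SparkStreamingDemo extends BaseSparkStreaming {

  /** process方法同样会被fire框架自动调起,业务逻辑写在此处 */
  override def process: Unit = {
    val dstream = this.fire.createKafkaDirectStream() // 使用api的方式消费kafka
    dstream.print()
    // 提交streaming任务
    this.fire.start
  }
}
```
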
#### 3.3 业务逻辑 Fire父类中统一约定了process方法,该方法会被fire框架自动调用,用户无需在代码中主动调用该方法。process方法作为业务逻辑的聚集地,是业务逻辑的开始。 ```scala override def process: Unit = { val dstream = this.fire.createKafkaDirectStream() dstream.print // 提交streaming任务 this.fire.start } ``` **说明:**Fire框架无需编写main方法,无需主动初始化sparksession或flink的environment等对象。这些会被fire框架自动初始化完成,开发者只需在代码中使用this.的方式引用即可。如果有spark或flink调优参数,可以直接复制到@Config注解中,这些调优参数会在fire框架初始化spark或flink引擎上下文时自动生效。 ================================================ FILE: docs/feature.md ================================================ ================================================ FILE: docs/highlight/checkpoint.md ================================================ # Flink Checkpoint动态调优   Flink作为有状态的流式计算引擎,周期性的checkpoint至关重要。checkpoint的周期不宜设置过长或过短,针对不同的任务要区别对待。甚至针对同一个任务,在不同场景下checkpoint过程也会因为**超时**或**反压**等原因导致失败。下面先来看一下传统checkpoint调优所面临的问题: ## 一、传统checkpoint调优痛点   Flink checkpoint**速率**、频率、**超时时间**参数等直接影响了任务的健康度。当flink任务重启时,会因消息积压导致任务反压,任务反压反过来会促使checkpoint变慢甚至是超时。如此一来,仿佛进入了一个恶性循环。 - **静态调整**:flink任务的checkpoint相关参数,必须在任务运行前提前设置好,运行时是没办法动态调整的 - **影响数据时效**:重启任务调整checkpoint,必然带来消息处理的延迟,对于实时性要求非常高的场景,影响很大 - **加剧反压**:重启任务后,会带来数据消费的滞后性,如果任务本身checkpoint耗时比较长,还会因为反压与同时做checkpoint带来性能进一步的恶化 ## 二、基于Fire实现动态调优   Fire框架为Flink checkpoint提供了增强,可以做到运行时动态调整checkpoint的相关参数,达到不重启任务即可实现动态调优的目的。Flink开发者只需集成[Fire框架](https://github.com/ZTO-Express/fire),就可以在运行时通过调用Fire框架提供的restful接口,从而实现动态调整checkpoint参数的目的了。 ## 三、典型场景 - **大状态任务**   假设线上有这样一个任务,每秒钟处理的消息量非常大,状态非常大,每次checkpoint耗时在5分钟以上。这个任务如果停止10分钟以上,会导致大量的消息积压,而消息积压导致的反压叠加checkpoint,会进一步影响任务的性能。这个时候,可以临时先将checkpoint周期调大,等反压结束后再调整回之前的checkpoint周期,降低checkpoint耗时较长带来的性能下降影响。 - **临时调整** 不愿停止任务,只是临时性的调整checkpoint周期、超时参数等。 ## 四、集成示例 ```scala @Checkpoint(interval = 100, unaligned = true) // 100s做一次checkpoint,开启非对齐checkpoint @Kafka(brokers = "localhost:9092", topics = "fire", groupId = "fire") object Demo extends BaseFlinkStreaming { override def process: Unit = { val dstream = this.fire.createKafkaDirectStream() // 使用api的方式消费kafka this.fire.sql("""create table statement ...""") this.fire.sql("""insert into statement ...""") this.fire.start } } ``` ## 五、动态调整checkpoint参数 集成了Fire框架的flink任务在运行起来以后,可以在flink的webui的Job Manager -> Configuration中查看到restful接口地址: ![fire-restful](../img/fire-restful.png) 找到接口地址以后,通过curl命令调用该接口即可实现动态调优: ```shell curl -H "Content-Type:application/json" -X POST --data '{"interval":60000,"minPauseBetween": 60000, "timeout": 60000}' http://ip:5753/system/checkpoint ``` 效果如下图所示: ![checkpoint动态调优](../img/checkpoint-duration.png) ================================================ FILE: docs/highlight/spark-duration.md ================================================ # Spark Streaming动态调整批次时间   Spark Streaming作为微批次流式计算引擎,批次的间隔时间可能是最常被调整和使用的参数之一。批次的间隔较小,时效性较好,但吞吐性能会下降。批次的间隔时间较大,时效性较差,但吞吐性能会提高很多。 ## 一、传统Streaming批次时间调整痛点   对于传统的Spark Streaming批次间隔时间调整,一般来说需要修改代码,重启Spark任务。这种方式比较麻烦,灵活性很差,没办法灵活的应对不同的生产场景。比如说电商大促期间,消息量可能是平日里的3倍以上,这个时候往往需要临时调大计算资源或调大Streaming的批次间隔时间来提高吞吐率。如果任务少还好,任务很多的情况下,就显得非常浪费时间了。 ## 二、基于Fire实现动态调优   Fire框架为Spark Streaming提供了增强,可以做到运行时动态调整Streaming的批次间隔时间,达到不重启任务即可实现动态调优的目的。Spark开发者只需集成[Fire框架](https://github.com/ZTO-Express/fire),就可以在运行时通过调用Fire框架提供的restful接口,从而实现动态调整批次间隔参数的目的了。 ## 三、典型场景 - **提高吞吐率** 动态的调大批次间隔时间,以应对数据洪峰,提高Spark Streaming的吞吐率。 - 
**临时调整** 不想停止任务,只是临时性的调整Streaming批次间隔时间等。 ## 四、集成示例 ```scala @Streaming(interval = 100, maxRatePerPartition = 100) // 100s一个Streaming batch,并限制消费速率 @Kafka(brokers = "localhost:9092", topics = "fire", groupId = "fire") object SparkDemo extends BaseSparkStreaming { override def process: Unit = { val dstream = this.fire.createKafkaDirectStream() // 使用api的方式消费kafka sql("""select * from xxx""").show() this.fire.start } } ``` ## 五、动态调整批次时间 集成了Fire框架的Spark Streaming任务在运行起来以后,可以在Spark的webui的Environment中查看到restful接口地址: ![streaming-duration](../img/streaming-duration.png) 找到接口地址以后,通过curl命令调用该接口即可实现动态调优: ```shell curl -H "Content-Type:application/json" -X POST --data '{"batchDuration": "20", "restartSparkContext": "false", "stopGracefully": "false"}' http://ip:27466/system/streaming/hotRestart ``` 调用上述接口后,只会重启StreamingContext,SparkContext不会被重启。 ================================================ FILE: docs/index.md ================================================ ## 一、开发手册 ### 1.1 开发与发布 #### [1.1.1 框架集成](dev/integration.md) #### [1.1.2 参数配置](dev/config.md) #### [1.1.3 集群环境](dev/engine-env.md) #### [1.1.4 任务发布](dev/deploy-script.md) ### 1.3 数据源 #### [1.3.1 Kafka Connector](connector/kafka.md) #### [1.3.2 RocketMQ Connector](connector/rocketmq.md) #### [1.3.3 Hive Connector](connector/hive.md) #### [1.3.4 HBase Connector](connector/hbase.md) #### [1.3.5 JDBC Connector](connector/jdbc.md) #### [1.3.6 Oracle Connector](connector/oracle.md) #### [1.3.7 Clickhouse Connector](connector/clickhouse.md) #### [1.3.8 ADB Connector](connector/adb.md) #### [1.3.9 Kudu Connector](#) ### [1.4 累加器](accumulator.md) ### [1.5 定时任务](schedule.md) ### [1.6 线程池与并发计算](threadpool.md) ### [1.7 Spark DataSource增强](datasource.md) ## 二、实时平台建设 ### [2.1 集成方案](platform.md) ### [2.2 内置接口](restful.md) ## 三、配置与调优 ### [3.1 Fire configuration](properties.md) ================================================ FILE: docs/platform.md ================================================ ================================================ FILE: docs/pom/flink-pom.xml ================================================ 4.0.0 com.zto.bigdata.flink flink-demo 1.0-SNAPSHOT ${project.artifactId} 2.3.2-SNAPSHOT 0.9.0 compile 2.12 13 0.11.0.2 2.8.0 2.6.0 1.1.0 1.1.0 org.apache.hive 1.2.0 2.5.30 2.10.5 4.8.0 5.1.30 18.0 UTF-8 ${scala.binary.version}.${scala.minor.version} ${flink.version}_${scala.binary.version} flink-1.12.2 1.12.2 1.12 org.apache.flink flink-table-planner-blink_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-queryable-state-runtime_${scala.binary.version} ${flink.version} ${maven.scope} flink-1.13.0 1.13.0 1.13 org.apache.flink flink-table-planner-blink_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-queryable-state-runtime_${scala.binary.version} ${flink.version} ${maven.scope} flink-1.14.3 true 1.14.3 1.14 org.apache.flink flink-table-planner_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-queryable-state-runtime ${flink.version} ${maven.scope} com.zto.fire fire-connector-flink-clickhouse_${flink.reference} ${fire.version} zto http://maven.dev.ztosys.com/nexus/content/groups/public/ true true aliyun https://maven.aliyun.com/repository/central true true central https://mirrors.huaweicloud.com/repository/maven/ true true com.zto.fire fire-common_${scala.binary.version} ${fire.version} com.zto.fire fire-core_${scala.binary.version} ${fire.version} com.zto.fire fire-flink_${flink.reference} ${fire.version} com.zto.fire fire-enhance-flink_${flink.reference} 
${fire.version} com.zto.fire fire-enhance-arthas_${scala.binary.version} ${fire.version} com.zto.fire fire-connector-hbase_${scala.binary.version} ${fire.version} com.zto.fire fire-connector-jdbc_${scala.binary.version} ${fire.version} com.zto.fire fire-connector-flink-rocketmq_${flink.reference} ${fire.version} com.zto.fire fire-metrics_${scala.binary.version} ${fire.version} com.sparkjava spark-core ${sparkjava.version} org.apache.flink flink-java ${flink.version} ${maven.scope} org.apache.flink flink-scala_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-streaming-scala_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-clients_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-runtime-web_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-queryable-state-client-java ${flink.version} ${maven.scope} org.apache.flink flink-statebackend-rocksdb_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-connector-kafka_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.kafka kafka_${scala.binary.version} ${kafka.version} ${maven.scope} org.apache.flink flink-table-api-java-bridge_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-table-api-java ${flink.version} ${maven.scope} org.apache.flink flink-table-api-scala-bridge_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-table-common ${flink.version} ${maven.scope} org.apache.flink flink-connector-hive_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-connector-jdbc_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-json ${flink.version} ${maven.scope} org.apache.flink flink-connector-elasticsearch-base_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-hadoop-compatibility_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-table-planner_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.rocketmq rocketmq-client ${rocketmq.version} org.apache.rocketmq rocketmq-acl ${rocketmq.version} org.apache.flink flink-orc-nohive_${scala.binary.version} ${flink.version} org.apache.flink flink-shaded-hadoop-2-uber 2.6.5-8.0 ${maven.scope} javax.servlet servlet-api org.apache.hive hive-exec ${hive.apache.version} ${maven.scope} calcite-core org.apache.calcite org.apache.hbase hbase-common ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-server ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-client ${hbase.version} calcite-core org.apache.calcite com.oracle ojdbc6 11.2.0.3 ${maven.scope} com.google.guava guava ${guava.version} true org.apache.maven.plugins maven-compiler-plugin 1.8 1.8 org.scala-tools maven-scala-plugin 2.15.2 scala-compile-first process-resources compile scala-test-compile process-test-resources testCompile org.codehaus.mojo build-helper-maven-plugin add-source generate-sources add-source src/main/java src/main/scala src/main/java-flink-${flink.version} src/main/scala-flink-${flink.version} add-test-source generate-test-sources add-test-source src/test/scala org.apache.maven.plugins maven-eclipse-plugin 2.10 true true org.scala-ide.sdt.core.scalanature org.eclipse.jdt.core.javanature org.scala-ide.sdt.core.scalabuilder org.scala-ide.sdt.launching.SCALA_CONTAINER org.eclipse.jdt.launching.JRE_CONTAINER org.scala-lang:scala-library 
org.scala-lang:scala-compiler **/*.scala **/*.java org.apache.maven.plugins maven-surefire-plugin 2.19.1 **/*.java **/*.scala org.apache.maven.plugins maven-shade-plugin 2.4.2 package shade *:* META-INF/*.SF META-INF/*.DSA META-INF/*.RSA zto-${project.artifactId}-${project.version} ================================================ FILE: docs/pom/spark-pom.xml ================================================ 4.0.0 com.zto.bigdata.spark spark-demo 1.0-SNAPSHOT 2008 compile 2.3.2-SNAPSHOT 2.6.0 1.1.0 1.2.0 1.4.0 2.5.30 2.8.10 18.0 0.9.0 4.8.0 2.12 13 0.11.0.2 2.8.0 org.apache.hive 4.8.0 0.0.3 5.1.49 18.0 2.6.0 3.5.4 UTF-8 ${scala.binary.version}.${scala.minor.version} ${spark.version}_${scala.binary.version} zto http://maven.dev.ztosys.com/nexus/content/groups/public/ true true aliyun https://maven.aliyun.com/repository/central true true huaweicloud https://mirrors.huaweicloud.com/repository/maven/ true true spark-3.0.2 3.0.2 3.0 2.10.5 2.12 13 org.apache.spark spark-avro_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.hudi hudi-spark3_${scala.binary.version} ${hudi.version} org.apache.spark spark-hive_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.hive hive-common org.apache.hive hive-exec org.apache.hive hive-metastore org.apache.hive hive-serde org.apache.hive hive-shims org.apache.spark spark-hive-thriftserver_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.hive hive-cli org.apache.hive hive-jdbc org.apache.hive hive-beeline ${hive.group} hive-cli ${hive.version} ${maven.scope} ${hive.group} hive-jdbc ${hive.version} ${maven.scope} ${hive.group} hive-beeline ${hive.version} ${maven.scope} ${hive.group} hive-common ${hive.version} ${maven.scope} ${hive.group} hive-metastore ${hive.version} ${maven.scope} ${hive.group} hive-exec ${hive.version} ${maven.scope} org.apache.commons commons-lang3 org.apache.spark spark-core_2.10 org.apache.hudi hudi-spark-bundle_${scala.binary.version} ${hudi.version} org.apache.hudi hudi-spark-client ${hudi.version} org.apache.hudi hudi-utilities-bundle_2.12 ${hudi.version} spark-2.3.2 2.3.2 2.3 2.6.7 2.11 8 io.netty netty-all ${netty.version} ${maven.scope} com.zto.fire fire-common_${scala.binary.version} ${fire.version} com.zto.fire fire-core_${scala.binary.version} ${fire.version} com.zto.fire fire-spark_${spark.reference} ${fire.version} com.zto.fire fire-enhance-spark_${spark.reference} ${fire.version} com.zto.fire fire-connector-spark-rocketmq_${spark.reference} ${fire.version} com.zto.fire fire-connector-spark-hbase_${spark.reference} ${fire.version} com.zto.fire fire-connector-hbase_${scala.binary.version} ${fire.version} com.zto.fire fire-connector-jdbc_${scala.binary.version} ${fire.version} org.scala-lang scala-library ${scala.version} org.scala-lang scala-compiler ${scala.version} org.scala-lang scala-reflect ${scala.version} com.fasterxml.jackson.core jackson-databind 2.10.0 ${maven.scope} com.fasterxml.jackson.core jackson-core 2.10.0 ${maven.scope} org.apache.spark spark-core_${scala.binary.version} com.esotericsoftware.kryo kryo ${spark.version} ${maven.scope} org.apache.spark spark-sql_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.spark spark-streaming_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.spark spark-hive_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.spark spark-sql-kafka-0-10_${scala.binary.version} ${spark.version} org.apache.spark spark-streaming_${scala.binary.version} ${spark.version} ${maven.scope} 
org.apache.spark spark-streaming-kafka-0-10_${scala.binary.version} ${spark.version} org.apache.hadoop hadoop-common ${hadoop.version} ${maven.scope} org.apache.hadoop hadoop-hdfs ${hadoop.version} ${maven.scope} org.apache.hadoop hadoop-client ${hadoop.version} ${maven.scope} org.apache.hbase hbase-common ${hbase.version} org.apache.hbase hbase-client org.apache.hbase hbase-server ${hbase.version} org.apache.hbase hbase-client org.apache.hbase hbase-client ${hbase.version} org.apache.rocketmq rocketmq-client ${rocketmq.version} org.apache.hudi hudi-spark-bundle_${scala.binary.version} 0.7.0 ${maven.scope} ru.yandex.clickhouse clickhouse-jdbc 0.2.4 ${maven.scope} com.google.guava guava ${guava.version} true org.apache.maven.plugins maven-compiler-plugin 1.8 1.8 org.scala-tools maven-scala-plugin 2.15.2 scala-compile-first process-resources compile scala-test-compile process-test-resources testCompile org.codehaus.mojo build-helper-maven-plugin add-source generate-sources add-source src/main/java src/main/scala src/main/java-spark-${spark.version} src/main/scala-spark-${spark.version} add-test-source generate-test-sources add-test-source src/test/scala org.apache.maven.plugins maven-eclipse-plugin 2.10 true true org.scala-ide.sdt.core.scalanature org.eclipse.jdt.core.javanature org.scala-ide.sdt.core.scalabuilder org.scala-ide.sdt.launching.SCALA_CONTAINER org.eclipse.jdt.launching.JRE_CONTAINER org.scala-lang:scala-library org.scala-lang:scala-compiler **/*.scala **/*.java org.apache.maven.plugins maven-surefire-plugin 2.19.1 **/*.java **/*.scala org.apache.maven.plugins maven-shade-plugin 2.4.2 package shade *:* META-INF/*.SF META-INF/*.DSA META-INF/*.RSA zto-${project.artifactId}-${project.version} ================================================ FILE: docs/properties.md ================================================ # fire框架参数   Fire框架提供了很多参数,这些参数为个性化调优带来了很大的灵活性。参数大体分为:**fire框架参数**(*fire.properties*)、**spark引擎参数**(*spark.properties*)、**flink引擎参数**(*flink.properties*)、**kafka参数**、**hbase参数**等。详见以下列表: # 一、fire框架参数 | 参数 | 默认值 | 含义 | 生效版本 | 是否废弃 | | --------------------------------------------------- | ------------------- | ------------------------------------------------------------ | -------- | -------- | | fire.thread.pool.size | 5 | fire内置线程池大小 | 0.4.0 | 否 | | fire.thread.pool.schedule.size | 5 | fire内置定时任务线程池大小 | 0.4.0 | 否 | | fire.rest.enable | true | 用于配置是否启用fire框架内置的restful服务,可用于与平台系统做集成。 | 0.3.0 | 否 | | fire.conf.show.enable | true | 是否打印非敏感的配置信息 | 0.1.0 | 否 | | fire.rest.url.show.enable | false | 是否在日志中打印fire框架restful服务地址 | 0.3.0 | 否 | | fire.rest.url.hostname | false | 是否启用hostname作为rest服务的访问地址 | 2.0.0 | 否 | | fire.acc.enable | true | 是否启用fire框架内置的所有累加器 | 0.4.0 | 否 | | fire.acc.log.enable | true | 是否启用fire框架日志累加器 | 0.4.0 | 否 | | fire.acc.multi.counter.enable | true | 是否启用多值累加器 | 0.4.0 | 否 | | fire.acc.multi.timer.enable | true | 是否启用时间维度累加器 | 0.4.0 | 否 | | fire.log.enable | true | fire框架埋点日志开关,关闭以后将不再打印埋点日志 | 0.4.0 | 否 | | fire.log.sql.length | 100 | 用于限定fire框架中sql日志的字符串长度 | 0.4.1 | 否 | | fire.jdbc.storage.level | memory_and_disk_ser | fire框架针对jdbc操作后数据集的缓存策略,避免重复查询数据库 | 0.4.0 | 否 | | fire.jdbc.query.partitions | 10 | 通过JdbcConnector查询后将数据集放到多少个分区中,需根据实际的结果集做配置 | 0.3.0 | 否 | | fire.task.schedule.enable | true | 是否启用fire框架定时任务,基于quartz实现 | 0.4.0 | 否 | | fire.dynamic.conf.enable | true | 是否启用动态配置功能,fire框架允许在运行时更新用户配置信息,比如:rdd.repartition(this.conf.getInt(count)),此处可实现动态的改变分区大小,实现动态调优。 | 0.4.0 | 否 | | fire.restful.max.thread | 8 | fire框架rest接口服务最大线程数,如果平台调用fire接口比较频繁,建议调大。 | 0.4.0 
| 否 | | fire.quartz.max.thread | 8 | quartz最大线程池大小,如果任务中的定时任务比较多,建议调大。 | 0.4.0 | 否 | | fire.acc.log.min.size | 500 | 收集日志记录保留的最小条数。 | 0.4.0 | 否 | | fire.acc.log.max.size | 1000 | 收集日志记录保留的最大条数。 | 0.4.0 | 否 | | fire.acc.timer.max.size | 1000 | timer累加器保留最大的记录数 | 0.4.0 | 否 | | fire.acc.timer.max.hour | 12 | timer累加器清理几小时之前的记录 | 0.4.0 | 否 | | fire.acc.env.enable | true | env累加器开关 | 0.4.0 | 否 | | fire.acc.env.max.size | 500 | env累加器保留最多的记录数 | 0.4.0 | 否 | | fire.acc.env.min.size | 100 | env累加器保留最少的记录数 | 0.4.0 | 否 | | fire.scheduler.blacklist | | 定时调度任务黑名单,配置的value为定时任务方法名,多个以逗号分隔,配置黑名单的方法将不会被quartz定时调度。 | 0.4.1 | 否 | | fire.conf.print.blacklist | .map.,pass,secret | 配置打印黑名单,含有配置中指定的片段将不会被打印,也不会被展示到spark&flink的webui中。 | 0.4.2 | 否 | | fire.restful.port.retry_num | 3 | 启用fire restserver可能会因为端口冲突导致失败,通过该参数可允许fire重试几次。 | 1.0.0 | 否 | | fire.restful.port.retry_duration | 1000 | 端口重试间隔时间(ms) | 1.0.0 | 否 | | fire.log.level.conf.org.apache.spark | info | 用于设置某个包的日志级别,默认将spark包所有的类日志级别设置为info | 1.0.0 | 否 | | fire.deploy_conf.enable | true | 是否进行累加器的分布式初始化 | 0.4.0 | 否 | | fire.exception_bus.size | 1000 | 用于限制每个jvm实例内部queue用于存放异常对象数最大大小,避免队列过大造成内存溢出 | 2.0.0 | 否 | | fire.buried_point.datasource.enable | true | 是否开启数据源埋点,开启后fire将自动采集任务用到的数据源信息(kafka、jdbc、hbase、hive等)。 | 2.0.0 | 否 | | fire.buried_point.datasource.max.size | 200 | 用于存放埋点的队列最大大小,超过该大小将会被丢弃 | 2.0.0 | 否 | | fire.buried_point.datasource.initialDelay | 30 | 定时解析埋点SQL的初始延迟(s) | 2.0.0 | 否 | | fire.buried_point.datasource.period | 60 | 定时解析埋点SQL的执行频率(s) | 2.0.0 | 否 | | fire.buried_point.datasource.map.tidb | 4000 | 用于jdbc url的识别,当无法通过driver class识别数据源时,将从url中的端口号进行区分,不同数据配置使用统一的前缀:fire.buried_point.datasource.map. | 2.0.0 | 否 | | fire.conf.adaptive.prefix | true | 是否开启配置自适应前缀,自动为配置加上引擎前缀(spark.\|flink.) | 2.0.0 | 否 | | fire.user.common.conf | common.properties | 用户统一配置文件,允许用户在该配置文件中存放公共的配置信息,优先级低于任务配置文件(多个以逗号分隔) | 2.0.0 | 否 | | fire.shutdown.auto.exit | true | 是否在调用shutdown方法时主动退出jvm进程,如果为true,则执行到this.stop方法,关闭上下文信息,回收线程池后将调用System.exit(0)强制退出进程。 | 2.0.0 | 否 | | fire.kafka.cluster.map.test | ip1:9092,ip2:9092 | kafka集群名称与集群地址映射,便于用户配置中通过别名即可消费指定的kafka。比如:kafka.brokers.name=test则表明消费ip1:9092,ip2:9092这个kafka集群。当然,也支持直接配置url:kafka.brokers.name=ip1:9092,ip2:9092。 | 0.1.0 | 否 | | fire.hive.default.database.name | tmp | 默认的hive数据库 | 0.1.0 | 否 | | fire.hive.table.default.partition.name | ds | 默认的hive分区字段名称 | 0.1.0 | 否 | | fire.hive.cluster.map.test | thrift://ip:9083 | 测试集群hive metastore地址(别名:test),任务中就可以通过fire.hive.cluster=test这种配置方式指定连接test对应的thrift server地址。 | | | | fire.hbase.batch.size | 10000 | 单个线程读写HBase的数据量 | 0.1.0 | 否 | | fire.hbase.storage.level | memory_and_disk_ser | fire框架针对hbase操作后数据集的缓存策略,避免因懒加载或其他原因导致的重复读取hbase问题,降低hbase压力。 | 0.3.2 | 否 | | fire.hbase.scan.partitions | -1 | 通过HBase scan后repartition的分区数,需根据scan后的数据量做配置,-1表示不生效。 | 0.3.2 | 否 | | fire.hbase.table.exists.cache.enable | true | 是否开启HBase表存在判断的缓存,开启后表存在判断将避免大量的connection消耗 | 2.0.0 | 否 | | fire.hbase.table.exists.cache.reload.enable | true | 是否开启HBase表存在列表缓存的定时更新任务,避免hbase表被drop导致报错。 | 2.0.0 | 否 | | fire.hbase.table.exists.cache.initialDelay | 60 | 定时刷新缓存HBase表任务的初始延迟(s) | 2.0.0 | 否 | | fire.hbase.table.exists.cache.period | 600 | 定时刷新缓存HBase表任务的执行频率(s) | 2.0.0 | 否 | | fire.hbase.cluster.map.test | zk1:2181,zk2:2181 | 测试集群hbase的zk地址(别名:test) | 2.0.0 | 否 | | fire.hbase.conf.hbase.zookeeper.property.clientPort | 2181 | hbase connection 配置,约定以:fire.hbase.conf.开头,比如:fire.hbase.conf.hbase.rpc.timeout对应hbase中的配置为hbase.rpc.timeout | 2.0.0 | 否 | | fire.config_center.enable | true | 
是否在任务启动时从配置中心获取配置文件,以便实现动态覆盖jar包中的配置信息。 | 1.0.0 | 否 | | fire.config_center.local.enable | false | 本地运行环境下(Windows、Mac)是否调用配置中心接口获取配置信息。 | 1.0.0 | 否 | | fire.config_center.register.conf.secret | | 配置中心接口调用秘钥 | 1.0.0 | 否 | | fire.config_center.register.conf.prod.address | | 配置中心接口地址 | 0.4.1 | 否 | # 二、Spark引擎参数 | 参数 | 默认值 | 含义 | 生效版本 | 是否废弃 | | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | -------- | -------- | | spark.appName | | spark的应用名称,为空则取默认获取类名 | 0.1.0 | 否 | | spark.local.cores | * | spark local模式下使用多少core运行,默认为local[*],自动根据当前pc的cpu核心数设置 | 0.4.1 | 否 | | spark.chkpoint.dir | | spark checkpoint目录地址 | 0.1.0 | 否 | | spark.log.level | WARN | spark的日志级别 | 0.1.0 | 否 | | spark.fire.scheduler.blacklist | jvmMonitor | 定时任务黑名单,指定到@Scheduled所修饰的方法名,多个以逗号分隔。当配置了黑名单后,该定时任务将不会被定时调用。 | 0.4.0 | 否 | | spark.kafka.group.id | 指定spark消费kafka的groupId | kafka的groupid,为空则取类名 | 0.1.0 | 否 | | spark.kafka.brokers.name | | 用于配置任务消费的kafka broker地址,如果通过fire.kafka.cluster.map.xxx指定了broker别名,则此处也可以填写别名。 | 0.1.0 | 否 | | spark.kafka.topics | | 消费的topic列表,多个以逗号分隔 | 0.1.0 | 否 | | spark.kafka.starting.offsets | latest | 用于配置启动时的消费位点,默认取最新 | 0.1.0 | 否 | | spark.kafka.failOnDataLoss | true | 数据丢失时执行失败 | 0.1.0 | 否 | | spark.kafka.enable.auto.commit | false | 是否启用自动commit kafka的offset | 0.4.0 | 否 | | spark.kafka.conf.xxx | | 以spark.kafka.conf开头加上kafka参数,则可用于设置kafka相关的参数。比如:spark.kafka.conf.request.timeout.ms对应kafka的request.timeout.ms参数。 | 0.4.0 | 否 | | spark.hive.cluster | | 用于配置spark连接的hive thriftserver地址,支持url和别名两种配置方式。别名需要事先通过fire.hive.cluster.map.别名 = thrift://ip:9083指定。 | 0.1.0 | 否 | | spark.rocket.cluster.map.别名 | ip:9876 | rocketmq别名列表 | 1.0.0 | 否 | | spark.rocket.conf.xxx | | 以spark.rocket.conf开头的配置支持所有rocket client的配置 | 1.0.0 | 否 | | spark.hdfs.ha.enable | true | 是否启用hdfs的ha配置,避免将hdfs-site.xml、core-site.xml放到resources中导致多hadoop集群hdfs不灵活的问题。同时也可以避免引namenode维护导致spark任务挂掉的问题。 | 1.0.0 | 否 | | spark.hdfs.ha.conf.test.fs.defaultFS | hdfs://nameservice1 | 对应fs.defaultFS,其中test与fire.hive.cluster.map.test中指定的别名test相对应,当通过fire.hive.cluster=test指定读写test这个hive时,namenode的ha将生效。 | 1.0.0 | 否 | | spark.hdfs.ha.conf.test.dfs.nameservices | nameservice1 | 对应dfs.nameservices | 1.0.0 | 否 | | spark.hdfs.ha.conf.test.dfs.ha.namenodes.nameservice1 | namenode5231,namenode5229 | 对应dfs.ha.namenodes.nameservice1 | 1.0.0 | 否 | | spark.hdfs.ha.conf.test.dfs.namenode.rpc-address.nameservice1.namenode5231 | ip:8020 | 对应dfs.namenode.rpc-address.nameservice1.namenode5231 | 1.0.0 | 否 | | spark.hdfs.ha.conf.test.dfs.namenode.rpc-address.nameservice1.namenode5229 | ip2:8020 | 对应dfs.namenode.rpc-address.nameservice1.namenode5229 | 1.0.0 | 否 | | spark.hdfs.ha.conf.test.dfs.client.failover.proxy.provider.nameservice1 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider | 对应dfs.client.failover.proxy.provider.nameservice1 | 1.0.0 | 否 | | spark.impala.connection.url | jdbc:hive2://ip:21050/;auth=noSasl | impala jdbc地址 | 0.1.0 | 否 | | spark.impala.jdbc.driver.class.name | org.apache.hive.jdbc.HiveDriver | impala jdbc驱动 | 0.1.0 | 否 | | spark.datasource.options. 
| | 以此开头的配置将被加载到datasource api的options中 | 2.0.0 | 否 | | spark.datasource.format | | datasource api的format | 2.0.0 | 否 | | spark.datasource.saveMode | Append | datasource api的saveMode | 2.0.0 | 否 | | spark.datasource.saveParam | | 用于dataFrame.write.format.save()参数 | 2.0.0 | 否 | | spark.datasource.isSaveTable | false | 用于决定调用save(path)还是saveAsTable | 2.0.0 | 否 | | spark.datasource.loadParam | | 用于spark.read.format.load()参数 | 2.0.0 | 否 | # 三、Flink引擎参数 | 参数 | 默认值 | 含义 | 生效版本 | 是否废弃 | | ------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | -------- | -------- | | flink.appName | | flink的应用名称,为空则取类名 | 1.0.0 | 否 | | flink.kafka.group.id | | kafka的groupid,为空则取类名 | 1.0.0 | 否 | | flink.kafka.brokers.name | | 用于配置任务消费的kafka broker地址,如果通过fire.kafka.cluster.map.xxx指定了broker别名,则此处也可以填写别名。 | 1.0.0 | 否 | | flink.kafka.topics | | 消费的kafka topic列表,多个以逗号分隔 | 1.0.0 | 否 | | flink.kafka.starting.offsets | | 用于配置启动时的消费位点,默认取最新 | 1.0.0 | 否 | | flink.kafka.failOnDataLoss | true | 数据丢失时执行失败 | 1.0.0 | 否 | | flink.kafka.enable.auto.commit | false | 是否启用自动提交kafka offset | 1.0.0 | 否 | | flink.kafka.CommitOffsetsOnCheckpoints | true | 是否在checkpoint时记录offset值 | 1.0.0 | 否 | | flink.kafka.StartFromTimestamp | 0 | 设置从指定时间戳位置开始消费kafka | 1.0.0 | 否 | | flink.kafka.StartFromGroupOffsets | false | 从topic中指定的group上次消费的位置开始消费,必须配置group.id参数 | 1.0.0 | 否 | | flink.log.level | WARN | 默认的日志级别 | 1.0.0 | 否 | | flink.hive.cluster | | 用于配置flink读写的hive集群别名 | 1.0.0 | 否 | | flink.hive.version | | 指定hive版本号 | 1.0.0 | 否 | | flink.default.database.name | tmp | 默认的hive数据库 | 1.0.0 | 否 | | flink.default.table.partition.name | ds | 默认的hive分区字段名称 | 1.0.0 | 否 | | flink.hive.catalog.name | hive | hive的catalog名称 | 1.0.0 | 否 | | flink.fire.hive.site.path.map.别名 | test | /path/to/hive-site-path/ | 1.0.0 | 否 | | flink.hbase.cluster | test | 读写的hbase集群zk地址 | 1.0.0 | 否 | | flink.max.parallelism | | 用于配置flink的max parallelism | 1.0.0 | 否 | | flink.default.parallelism | | 用于配置任务默认的parallelism | 1.0.0 | 否 | | flink.stream.checkpoint.interval | -1 | checkpoint频率,-1表示关闭 | 1.0.0 | 否 | | flink.stream.checkpoint.mode | EXACTLY_ONCE | checkpoint的模式:EXACTLY_ONCE/AT_LEAST_ONCE | 1.0.0 | 否 | | flink.stream.checkpoint.timeout | 600000 | checkpoint超时时间,单位:毫秒 | 1.0.0 | 否 | | flink.stream.checkpoint.max.concurrent | 1 | 同时checkpoint操作的并发数 | 1.0.0 | 否 | | flink.stream.checkpoint.min.pause.between | 0 | 两次checkpoint的最小停顿时间 | 1.0.0 | 否 | | flink.stream.checkpoint.prefer.recovery | false | 如果有更近的checkpoint时,是否将作业回退到该检查点 | 1.0.0 | 否 | | flink.stream.checkpoint.tolerable.failure.number | 0 | 可容忍checkpoint失败的次数,默认不允许失败 | 1.0.0 | 否 | | flink.stream.checkpoint.externalized | RETAIN_ON_CANCELLATION | 当cancel job时保留checkpoint | 1.0.0 | 否 | | flink.sql.log.enable | false | 是否打印组装with语句后的flink sql,由于with表达式中可能含有敏感信息,默认为关闭 | 2.0.0 | 否 | | flink.sql.with.xxx | flink.sql.with.connector=jdbc flink.sql.with.url=jdbc:mysql://ip:3306/db | 以flink.sql.with.开头的配置,用于sql语句的with表达式。通过this.fire.sql(sql, keyNum)即可自动读取并映射成with表达式的sql。避免sql中的with表达式硬编码到代码中,提高灵活性。 | 2.0.0 | 否 | | flink.sql_with.replaceMode.enable | false | 是否启用配置文件中with强制替换sql中已有的with表达式,如果启用,则会强制替换掉代码中sql的with列表,达到最大的灵活性。 | 2.0.0 | 否 | | flink.sql.udf.fireUdf.enable | false | 是否启用fire注册外部udf jar包中的类为发flink sql的udf函数 | 2.0.0 | 否 | | flink.sql.conf.pipeline.jars | /path/to/udf/jar/ | 用于指定udf jar包路径 | 2.0.0 | 否 | | flink.sql.udf.conf.xxx | 包名+类名 | 
用于指定udf函数名称与类名的对应关系,比如函数名为test,包名为com.udf.Udf,则配置为:flink.sql.udf.conf.test=com.udf.Udf | 2.0.0 | 否 | ================================================ FILE: docs/restful.md ================================================ # fire内置的restful接口 ​ fire框架在提供丰富好用的api给开发者的同时,也提供了大量的restful接口给大数据实时计算平台。通过对外暴露的restful接口,可以将每个任务与实时平台进行深入绑定,为平台建设提供了更大的想象空间。其中包括:**实时热重启接口、动态批次时间调整接口、sql在线调试接口**、**Arthas诊断jvm**、**实时血缘分析**等。 | 引擎 | 接口 | 含义 | | --------- | ---------------------------- | ------------------------------------------------------------ | | 通用 | /system/kill | 用于kill 任务自身。 | | 通用 | /system/cancelJob | 生产环境中,通常会禁用掉spark webui的kill功能,但有时任务owner有kill的需求,为了满足此类需求,fire通过接口的方式将kill功能暴露给平台,由平台控制权限并完成kill job的触发。 | | 通用 | /system/cancelStage | 同job的kill功能,该接口用于kill指定的stage。 | | 通用 | /system/sql | 该接口允许用户传递sql给spark任务执行,可用于sql的动态调试,支持在任务开发阶段spark临时表与hive表的关联,降低sql开发的人力成本。 | | 通用 | /system/sparkInfo | 用户获取当前spark任务的配置信息。 | | 通用 | /system/counter | 用于获取累加器的值。 | | 通用 | /system/multiCounter | 用于获取多值累加器的值。 | | 通用 | /system/multiTimer | 用于获取时间维度多值累加器的值。 | | 通用 | /system/log | 用于获取日志信息,平台可调用该接口获取日志并进行日志展示。 | | 通用 | /system/env | 获取运行时状态信息,包括GC、jvm、thread、memory、cpu等 | | 通用 | /system/listDatabases | 用于列举当前spark任务catalog中所有的数据库,包括hive库等。 | | 通用 | /system/listTables | 用于列举指定库下所有的表信息。 | | 通用 | /system/listColumns | 用于列举某张表的所有字段信息。 | | spark通用 | /system/listFunctions | 用于列举当前任务支持的函数。 | | 通用 | /system/setConf | 用于配置热覆盖,在运行时动态修改指定的配置信息。比如动态修改spark streaming某个rdd的分区数,实现动态调优的目的。 | | 通用 | /system/datasource | 用于获取当前任务使用到的数据源信息、表信息等。支持jdbc、hbase、kafka、hive等众多组件,可用于和平台集成,做实时血缘关系。 | | spark | /system/streaming/hotRestart | spark streaming热重启接口,可以动态的修改运行中的spark streaming的批次时间。 | | flink | /system/checkpoint | 用于运行时热修改checkpoint | | 通用 | /system/arthas | 动态开启或关闭arthas服务,用于运行时分析诊断jvm | ================================================ FILE: docs/schedule.md ================================================ # 定时任务   Fire框架内部进一步封装了quart进行定时任务的声明与调度,使用方法和spring的@Scheduled注解类似。参考:[示例程序](../fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/schedule/ScheduleTest.scala)。基于该功能,可以很容易实现诸如定时加载与更新维表等功能,十分方便。 ```scala /** * 声明了@Scheduled注解的方法是定时任务方法,会周期性执行 * * @cron cron表达式 * @scope 默认同时在driver端和executor端执行,如果指定了driver,则只在driver端定时执行 * @concurrent 上一个周期定时任务未执行完成时是否允许下一个周期任务开始执行 * @startAt 用于指定第一次开始执行的时间 * @initialDelay 延迟多长时间开始执行第一次定时任务 */ @Scheduled(cron = "0/5 * * * * ?", scope = "driver", concurrent = false, startAt = "2021-01-21 11:30:00", initialDelay = 60000) def loadTable: Unit = { this.logger.info("更新维表动作") } /** * 只在driver端执行,不允许同一时刻同时执行该方法 * startAt用于指定首次执行时间 */ @Scheduled(cron = "0/5 * * * * ?", scope = "all", concurrent = false) def test2: Unit = { this.logger.info("executorId=" + SparkUtils.getExecutorId + "=方法 test2() 每5秒执行" + DateFormatUtils.formatCurrentDateTime()) } // 每天凌晨4点01将锁标志设置为false,这样下一个批次就可以先更新维表再执行sql @Scheduled(cron = "0 1 4 * * ?") def updateTableJob: Unit = this.lock.compareAndSet(true, false) ``` **注:**目前定时任务不支持flink任务在每个TaskManager端执行。 ================================================ FILE: docs/threadpool.md ================================================ # 线程池与并发计算 集成Fire后,可以很简单的在程序内部进行多个任务的提交,充分榨干申请到的资源。 ```scala /** * 在driver中启用线程池的示例 * 1. 开启子线程执行一个任务 * 2. 开启子线程执行周期性任务 */ object ThreadTest extends BaseSparkStreaming { def main(args: Array[String]): Unit = { // 第二个参数为true表示开启checkPoint机制 this.init(10L, false) } /** * Streaming的处理过程强烈建议放到process中,保持风格统一 * 注:此方法会被自动调用,在以下两种情况下,必须将逻辑写在process中 * 1. 开启checkpoint * 2. 
支持streaming热重启(可在不关闭streaming任务的前提下修改batch时间) */ override def process: Unit = { // 第一次执行时延迟两分钟,每隔1分钟执行一次showSchema函数 this.runAsSchedule(this.showSchema, 1, 1) // 以子线程方式执行print方法中的逻辑 this.runAsThread(this.print) val dstream = this.fire.createKafkaDirectStream() dstream.foreachRDD(rdd => { println("count--> " + rdd.count()) }) this.fire.start } /** * 以子线程方式执行一次 */ def print: Unit = { println("==========子线程执行===========") } /** * 查看表结构信息 */ def showSchema: Unit = { println(s"${DateFormatUtils.formatCurrentDateTime()}--------> atFixRate <----------") this.fire.sql("use tmp") this.fire.sql("show tables").show(false) } } ``` ================================================ FILE: fire-common/pom.xml ================================================ 4.0.0 fire-common_${scala.binary.version} jar Fire : Common com.zto.fire fire-parent 2.3.2-SNAPSHOT ../pom.xml org.apache.kafka kafka_${scala.binary.version} ${kafka.version} ${maven.scope} org.apache.rocketmq rocketmq-client ${rocketmq.version} ${maven.scope} commons-httpclient commons-httpclient 3.1 org.apache.httpcomponents httpclient 4.3.3 org.apache.httpcomponents httpcore 4.4.3 org.apache.htrace htrace-core 3.2.0-incubating org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/anno/Config.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.anno; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行任务的配置,支持纯注解方式进行任务的参数配置以及指定多个配置文件 * * @author ChengLong 2021-8-3 10:49:30 * @since 2.1.1 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Config { /** * 配置文件名称列表 */ String[] files() default ""; /** * 配置项列表,key=value的字符串形式 */ String[] props() default ""; /** * 配置的字符串 */ String value() default ""; } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/anno/FieldName.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.anno; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 用于标识该field对应数据库中的名称 * Created by ChengLong on 2017-03-15. */ @Retention(RetentionPolicy.RUNTIME) @Target({ElementType.TYPE, ElementType.FIELD}) public @interface FieldName { /** * fieldName,映射到hbase中作为qualifier名称 */ String value() default ""; /** * 列族名称 */ String family() default "info"; /** * 不使用该字段,默认为使用 */ boolean disuse() default false; /** * 是否可以为空 */ boolean nullable() default true; /** * 是否为主键字段 * @return */ boolean id() default false; /** * HBase表的命名空间 */ String namespace() default "default"; /** * 字段注释 */ String comment() default ""; } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/anno/FireConf.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.anno; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行任务的配置,支持纯注解方式进行任务的参数配置以及指定多个配置文件 * * @author ChengLong 2022-08-18 08:57:23 * @since 2.3.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface FireConf { /** * 配置文件名称列表 */ String[] files() default ""; /** * 配置项列表,key=value的字符串形式 */ String[] props() default ""; /** * 配置的字符串 */ String value() default ""; } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/anno/Internal.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.anno; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * For fire internal use only. * * @author ChengLong 2020-11-13 09:39:28 */ @Retention(RetentionPolicy.SOURCE) @Target({ElementType.TYPE, ElementType.FIELD, ElementType.METHOD}) public @interface Internal { } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/anno/Rest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.anno; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 用于标识启用restful接口 * * @author ChengLong 2019-4-16 11:07:13 */ @Retention(RetentionPolicy.RUNTIME) @Target({ElementType.TYPE, ElementType.FIELD}) public @interface Rest { /** * restful路径名 * * @return */ String value() default ""; /** * 接口访问的方式: GET/POST * * @return */ String method() default "GET"; } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/anno/Scheduled.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.anno; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 定时任务注解,放在方法上,要求方法不带参数,且无返回值 * 优先级:cron > fixedInterval startAt > initialDelay * @author ChengLong 2019年11月4日 21:12:06 * @since 0.3.5 */ @Retention(RetentionPolicy.RUNTIME) @Target({ElementType.METHOD}) public @interface Scheduled { /** * cron表达式 */ String cron() default ""; /** * 指定是否允许并发执行同一个任务 * 默认为true,表示同一时间范围内同一个任务可以有多个实例并行执行 */ boolean concurrent() default true; /** * 按照给定的时间间隔(毫秒)周期性执行 */ long fixedInterval() default -1; /** * 周期性执行的次数,-1表示无限重复执行 */ long repeatCount() default -1; /** * 第一次延迟多久(毫秒)执行,0表示立即执行 */ long initialDelay() default -1; /** * 用于指定首次开始执行的时间,优先级高于initialDelay * 日期的格式为:yyyy-MM-dd HH:mm:ss */ String startAt() default ""; /** * 定时任务的作用域,driver、executor、all * 默认仅driver端执行 */ String scope() default "driver"; } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/anno/TestStep.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.anno; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 用于标识单元测试的测试步骤 * * @author ChengLong 2020-11-13 09:39:28 */ @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.METHOD) public @interface TestStep { /** * 测试步骤 */ int step() default 1; /** * 用于单元测试描述 */ String desc() default "单元测试说明"; } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/FireTask.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.bean; import com.zto.fire.common.util.*; /** * 用于封装Fire框架任务的基本信息 * * @author ChengLong 2022-08-30 14:44:03 * @since 2.3.2 */ public class FireTask { /** * 触发异常的执行引擎:spark/flink */ protected String engine; /** * 引擎版本 */ protected String engineVersion; /** * fire框架版本 */ protected String fireVersion; /** * 异常所在jvm进程发送的主机ip */ protected String ip; /** * 异常所属jvm进程所在的主机名称 */ protected String hostname; /** * 进程的pid */ protected String pid; /** * 任务的主类名:package+类名 */ protected String mainClass; /** * 异常发生的时间戳 */ protected String timestamp; /** * 任务启动时间 */ protected String launchTime; /** * 任务运行时间 */ protected Long uptime; /** * 运行时的appId */ protected String appId; /** * 任务提交模式 */ protected String deployMode; /** * spark:streaming、structured streaming、core * flink:streaming、batch */ protected String jobType; public FireTask() { this.engine = FireUtils.engine(); this.engineVersion = FireUtils.engineVersion(); this.fireVersion = FireUtils.fireVersion(); this.ip = OSUtils.getIp(); this.timestamp = DateFormatUtils.formatCurrentDateTime(); this.launchTime = DateFormatUtils.formatUnixDateTime(FireUtils.launchTime()); this.uptime = FireUtils.uptime(); this.mainClass = FireUtils.mainClass(); this.hostname = OSUtils.getHostName(); this.pid = OSUtils.getPid(); this.appId = FireUtils.applicationId(); this.deployMode = FireUtils.deployMode(); this.jobType = FireUtils.jobType().toString(); } public String getEngine() { return engine; } public void setEngine(String engine) { this.engine = engine; } public String getIp() { return ip; } public void setIp(String ip) { this.ip = ip; } public String getHostname() { return hostname; } public void setHostname(String hostname) { this.hostname = hostname; } public String getPid() { return pid; } public void setPid(String pid) { this.pid = pid; } public String getMainClass() { return mainClass; } public void setMainClass(String mainClass) { this.mainClass = mainClass; } public String getTimestamp() { return timestamp; } public void setTimestamp(String timestamp) { this.timestamp = timestamp; } public String getEngineVersion() { return engineVersion; } public void setEngineVersion(String engineVersion) { this.engineVersion = engineVersion; } public String getFireVersion() { return fireVersion; } public void setFireVersion(String fireVersion) { this.fireVersion = fireVersion; } public String getLaunchTime() { return launchTime; } public void setLaunchTime(String launchTime) { this.launchTime = launchTime; } public Long getUptime() { return uptime; } public void setUptime(Long uptime) { this.uptime = uptime; } public String getAppId() { return appId; } public void setAppId(String appId) { this.appId = appId; } public String getDeployMode() { return deployMode; } public void setDeployMode(String deployMode) { this.deployMode = deployMode; } public String getJobType() { return jobType; } public void setJobType(String jobType) { this.jobType = jobType; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/analysis/ExceptionMsg.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.bean.analysis; import com.zto.fire.common.bean.FireTask; import com.zto.fire.common.util.*; /** * 用于封装异常堆栈信息 * * @author ChengLong 2022-08-01 09:28:04 * @since 2.3.2 */ public class ExceptionMsg extends FireTask { /** * 异常堆栈类名 */ private String exceptionClass; /** * 异常堆栈的标题 */ private String stackTitle; /** * 异常堆栈详细信息 */ private String stackTrace; /** * 发送异常的sql语句 */ private String sql; public ExceptionMsg() { super(); } public ExceptionMsg(String stackTitle, String stackTrace, String exceptionClass, String sql) { super(); this.stackTitle = stackTitle; this.stackTrace = stackTrace; this.exceptionClass = exceptionClass; this.sql = sql; this.engine = FireUtils.engine(); this.ip = OSUtils.getIp(); this.timestamp = DateFormatUtils.formatCurrentDateTime(); this.mainClass = FireUtils.mainClass(); this.hostname = OSUtils.getHostName(); this.pid = OSUtils.getPid(); } public ExceptionMsg(Throwable e, String sql) { this(e.getMessage(), ExceptionBus.stackTrace(e), e.getClass().getName(), sql); } public ExceptionMsg(Throwable e) { this(e.getMessage(), ExceptionBus.stackTrace(e), e.getClass().getName(), ""); } public ExceptionMsg(String stackTitle, String stackTrace, String exceptionClass) { this(stackTitle, stackTrace, exceptionClass, ""); } public ExceptionMsg(String stackTrace) { this("", stackTrace, "", ""); } public String getEngine() { return engine; } public void setEngine(String engine) { this.engine = engine; } public String getStackTitle() { return stackTitle; } public void setStackTitle(String stackTitle) { this.stackTitle = stackTitle; } public String getStackTrace() { return stackTrace; } public void setStackTrace(String stackTrace) { this.stackTrace = stackTrace; } public String getSql() { return sql; } public void setSql(String sql) { this.sql = sql; } public String getIp() { return ip; } public void setIp(String ip) { this.ip = ip; } public String getHostname() { return hostname; } public void setHostname(String hostname) { this.hostname = hostname; } public String getPid() { return pid; } public void setPid(String pid) { this.pid = pid; } public String getMainClass() { return mainClass; } public void setMainClass(String mainClass) { this.mainClass = mainClass; } public String getTimestamp() { return timestamp; } public void setTimestamp(String timestamp) { this.timestamp = timestamp; } @Override public String toString() { return JSONUtils.toJSONString(this); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/config/ConfigurationParam.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package com.zto.fire.common.bean.config;

import com.zto.fire.common.enu.ConfigureLevel;

import java.util.Map;

/**
 * Parses the configuration items returned by the configuration center, e.g.:
 *
 * {"code":200,"content":{"FRAMEWORK":{"fire.thread.pool.size":"5","hive.cluster":"batch"},"TASK":{"fire.user.conf":"test","fire.conf.show.enable":"false"},"URGENT":{"hdfs.ha.conf.test.dfs.nameservices":"ns1","hdfs.ha.conf.test.fs.defaultFS":"hdfs://ns1"}}}
 *
 * @author ChengLong 2021-8-23 15:26:39
 * @since 2.2.0
 */
public class ConfigurationParam {
    private Integer code;
    private Map<ConfigureLevel, Map<String, String>> content;

    public Integer getCode() { return code; }

    public void setCode(Integer code) { this.code = code; }

    public Map<ConfigureLevel, Map<String, String>> getContent() { return content; }

    public void setContent(Map<ConfigureLevel, Map<String, String>> content) { this.content = content; }
}

================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/lineage/Lineage.java ================================================

/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package com.zto.fire.common.bean.lineage;

import com.zto.fire.common.bean.FireTask;

/**
 * Encapsulates the real-time lineage information collected by the framework
 *
 * @author ChengLong 2022-08-30 15:31:32
 * @since 2.3.2
 */
public class Lineage extends FireTask {
    /**
     * Datasource-level lineage information
     */
    private Object datasource;

    /**
     * SQL-level lineage
     */
    private SQLLineage sql;

    public Lineage() { super(); }

    public Lineage(Object lineage) { super(); this.datasource = lineage; }

    public Lineage(Object lineage, SQLLineage sql) { this.datasource = lineage; this.sql = sql; }

    public Object getDatasource() { return datasource; }

    public void setDatasource(Object datasource) { this.datasource = datasource; }

    public SQLLineage getSql() { return sql; }

    public void setSql(SQLLineage sql) { this.sql = sql; }
}
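// A minimal usage sketch of the lineage beans above (an illustration, not the framework's own
// collection path): it assembles an SQLLineage with one statement and one table relation, wraps
// it in a Lineage bean and serializes it with JSONUtils, which this module already uses for JSON
// output. It assumes it runs inside a Fire task, since the FireTask super-constructor resolves
// engine metadata through FireUtils; all table and connection names are sample values.
class LineageUsageSketch {
    public static void main(String[] args) {
        SQLLineage sqlLineage = new SQLLineage();
        sqlLineage.getStatements().add("INSERT INTO dim.target SELECT * FROM dw.source");
        sqlLineage.getRelations().add(new SQLTableRelations("dw.source", "dim.target"));

        // The datasource field is untyped (Object), so any datasource description can be attached
        Lineage lineage = new Lineage("jdbc:mysql://localhost:3306/demo", sqlLineage);
        System.out.println(com.zto.fire.common.util.JSONUtils.toJSONString(lineage));
    }
}

================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/lineage/SQLLineage.java ================================================

/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.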
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package com.zto.fire.common.bean.lineage;

import com.zto.fire.common.util.DatasourceDesc;

import java.util.LinkedList;
import java.util.List;

/**
 * Encapsulates the real-time SQL lineage information collected by the framework
 *
 * @author ChengLong 2022-09-01 13:30:22
 * @since 2.3.2
 */
public class SQLLineage implements DatasourceDesc {
    /**
     * SQL statements to be parsed
     */
    private List<String> statements;

    /**
     * Table information parsed from the SQL statements
     */
    private List<SQLTable> tables;

    /**
     * Data lineage relations between tables
     */
    private List<SQLTableRelations> relations;

    public SQLLineage() {
        this.statements = new LinkedList<>();
        this.tables = new LinkedList<>();
        this.relations = new LinkedList<>();
    }

    public List<String> getStatements() { return statements; }

    public void setStatements(List<String> statements) { this.statements = statements; }

    public void setTables(List<SQLTable> tables) { this.tables = tables; }

    public List<SQLTable> getTables() { return tables; }

    public void setRelations(List<SQLTableRelations> relations) { this.relations = relations; }

    public List<SQLTableRelations> getRelations() { return relations; }
}

================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/lineage/SQLTable.java ================================================

/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/
package com.zto.fire.common.bean.lineage;

import java.util.*;

/**
 * Encapsulates the real-time SQL lineage information collected by the framework: the tables used in the SQL
 *
 * @author ChengLong 2022-09-01 13:32:03
 * @since 2.3.2
 */
public class SQLTable {
    /**
     * Catalog type: Hive, Kafka, JDBC, etc.
     */
    private String catalog;

    /**
     * Cluster url of the catalog
     */
    private String cluster;

    /**
     * Physical table name
     */
    private String physicalTable;

    /**
     * Name of the temporary view registered in Spark or Flink
     */
    private String tmpView;

    /**
     * Table comment
     */
    private String comment;

    /**
     * Properties declared in the SQL, such as the options of a WITH clause
     */
    private Map<String, String> options;

    /**
     * Operations performed on this table by the task: SELECT, DROP, CREATE, etc.
     */
    private Set<String> operation;

    /**
     * Columns used, including column name and type
     */
    private Set<SQLTableColumns> columns;

    /**
     * Partitions used
     */
    private Set<SQLTablePartitions> partitions;

    public SQLTable() {
        this.operation = new HashSet<>();
        this.columns = new HashSet<>();
        this.options = new HashMap<>();
        this.partitions = new HashSet<>();
    }

    public SQLTable(String physicalTable) {
        this();
        this.physicalTable = physicalTable;
    }

    public SQLTable(String catalog, String cluster, String physicalTable, String tmpView, String comment, HashMap<String, String> options, HashSet<String> operation, HashSet<SQLTableColumns> columns, HashSet<SQLTablePartitions> partitions) {
        this.catalog = catalog;
        this.cluster = cluster;
        this.physicalTable = physicalTable;
        this.tmpView = tmpView;
        this.options = options;
        this.operation = operation;
        this.columns = columns;
        this.partitions = partitions;
        this.comment = comment;
    }

    public void setCatalog(String catalog) { this.catalog = catalog; }

    public String getCatalog() { return catalog; }

    public void setCluster(String cluster) { this.cluster = cluster; }

    public String getCluster() { return cluster; }

    public void setPhysicalTable(String physicalTable) { this.physicalTable = physicalTable; }

    public String getPhysicalTable() { return physicalTable; }

    public void setTmpView(String tmpView) { this.tmpView = tmpView; }

    public String getTmpView() { return tmpView; }

    public Map<String, String> getOptions() { return options; }

    public void setOptions(Map<String, String> options) { this.options = options; }

    public void setOperation(Set<String> operation) { this.operation = operation; }

    public Set<String> getOperation() { return operation; }

    public void setColumns(HashSet<SQLTableColumns> columns) { this.columns = columns; }

    public Set<SQLTableColumns> getColumns() { return columns; }

    public Set<SQLTablePartitions> getPartitions() { return partitions; }

    public void setPartitions(HashSet<SQLTablePartitions> partitions) { this.partitions = partitions; }

    public String getComment() { return comment; }

    public void setComment(String comment) { this.comment = comment; }
}
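// A minimal sketch of how an SQLTable entry in the lineage model might look (sample values only;
// the framework itself fills these beans in while parsing SQL): a Hive table with one selected
// column and one partition, attached to an SQLLineage.
class SQLTableSketch {
    public static void main(String[] args) {
        SQLTable table = new SQLTable("dw.user_order");
        table.setCatalog("hive");
        table.getOperation().add("SELECT");
        table.getColumns().add(new SQLTableColumns("order_id", "bigint"));
        table.getPartitions().add(new SQLTablePartitions("ds", "20220901"));

        SQLLineage lineage = new SQLLineage();
        lineage.getTables().add(table);
    }
}

================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/lineage/SQLTableColumns.java ================================================

/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.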
*/ package com.zto.fire.common.bean.lineage; import java.util.Objects; /** * 用于封装采集到SQL的实时血缘信息:字段级血缘 * * @author ChengLong 2022-09-01 13:34:58 * @since 2.3.2 */ public class SQLTableColumns { /** * 字段名称 */ private String name; /** * 字段类型 */ private String type; public SQLTableColumns() { } public SQLTableColumns(String name, String type) { this.name = name; this.type = type; } public void setName(String name) { this.name = name; } public String getName() { return name; } public void setType(String type) { this.type = type; } public String getType() { return type; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } SQLTableColumns that = (SQLTableColumns) o; return Objects.equals(name, that.name) && Objects.equals(type, that.type); } @Override public int hashCode() { return Objects.hash(name, type); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/lineage/SQLTablePartitions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.bean.lineage; import java.util.Objects; /** * 用于封装采集到SQL的实时血缘信息:分区血缘 * * @author ChengLong 2022-09-01 13:34:58 * @since 2.3.2 */ public class SQLTablePartitions { /** * 字段名称 */ private String name; /** * 字段类型 */ private String value; public SQLTablePartitions() { } public SQLTablePartitions(String name, String value) { this.name = name; this.value = value; } public void setName(String name) { this.name = name; } public String getName() { return name; } public void setValue(String value) { this.value = value; } public String getValue() { return value; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } SQLTablePartitions that = (SQLTablePartitions) o; return Objects.equals(name, that.name) && Objects.equals(value, that.value); } @Override public int hashCode() { return Objects.hash(name, value); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/lineage/SQLTableRelations.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.bean.lineage; import java.util.Objects; /** * 用于封装采集到SQL的实时血缘信息:描述表与表之间的关系,如:insert overwrite sinkTable select xxx from srcTable * * @author ChengLong 2022-09-01 13:31:23 * @since 2.3.2 */ public class SQLTableRelations { /** * 源表:SELECT */ private String srcTable; /** * 目标表:INSERT、CREATE */ private String sinkTable; public SQLTableRelations() { } public SQLTableRelations(String srcTable, String sinkTable) { this.srcTable = srcTable; this.sinkTable = sinkTable; } public void setSrcTable(String srcTable) { this.srcTable = srcTable; } public String getSrcTable() { return srcTable; } public void setSinkTable(String sinkTable) { this.sinkTable = sinkTable; } public String getSinkTable() { return sinkTable; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } SQLTableRelations that = (SQLTableRelations) o; return Objects.equals(srcTable, that.srcTable) && Objects.equals(sinkTable, that.sinkTable); } @Override public int hashCode() { return Objects.hash(srcTable, sinkTable); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/rest/ResultMsg.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.bean.rest; import com.zto.fire.common.enu.ErrorCode; import com.zto.fire.common.util.JSONUtils; import org.apache.commons.lang3.StringUtils; import java.util.Objects; /** * 返回消息 * * @author ChengLong 2018年6月12日 13:42:23 */ public class ResultMsg { // 消息体 private Object content; // 系统错误码 private ErrorCode code; // 错误描述 private String msg; /** * 验证是否成功 * * @param resultMsg * @return true: 成功 false 失败 */ public static boolean isSuccess(ResultMsg resultMsg) { return resultMsg != null && resultMsg.getCode() == ErrorCode.SUCCESS; } /** * 获取描述信息 * * @param resultMsg * @return 描述信息 */ public static String getMsg(ResultMsg resultMsg) { if (resultMsg != null) { return resultMsg.getMsg(); } else { return ""; } } /** * 获取状态码 * * @return 状态码 */ public static ErrorCode getCode(ResultMsg resultMsg) { if (resultMsg != null) { return resultMsg.getCode(); } return ErrorCode.ERROR; } public ResultMsg() { } public ResultMsg(String content, ErrorCode code, String msg) { this.content = content; this.code = code; this.msg = msg; } public Object getContent() { return content; } public void setContent(Object content) { this.content = content; } public ErrorCode getCode() { return code; } public void setCode(ErrorCode code) { this.code = code; } public String getMsg() { return msg; } public void setMsg(String msg) { this.msg = msg; } /** * 构建成功消息 */ public static String buildSuccess(Object content, String msg) { return new ResultMsg(Objects.toString(content, ""), ErrorCode.SUCCESS, msg).toString(); } /** * 构建失败消息 */ public static String buildError(String msg, ErrorCode errorCode) { return new ResultMsg("", errorCode, msg).toString(); } @Override public String toString() { return JSONUtils.toJSONString(this); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/rest/yarn/App.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.bean.rest.yarn; /** * 用于解析调用yarn接口返回的json * @author ChengLong 2019-5-15 17:50:06 */ public class App { // yarn applicationId private String id; // yarn程序的启动用户 private String user; // yarn程序名称 private String name; // yarn的队列名称 private String queue; // 程序的状态 private String state; // 程序的最终状态 private String finalStatus; // 执行进度 private Double progress; // 程序的ui private String trackingUI; // 程序ui的url地址 private String trackingUrl; // 诊断 private String diagnostics; // 集群id private Long clusterId; // 程序类型(spark、mr) private String applicationType; // 程序的标签 private String applicationTags; // 程序启动时间 private Long startedTime; // 程序结束时间 private Long finishedTime; // 程序执行时间 private Long elapsedTime; // master 的日志路径 private String amContainerLogs; // master所在主机host名称 private String amHostHttpAddress; // 已分配的内存大小 private Long allocatedMB; // 已分配的cpu数量 private Long allocatedVCores; // 运行的container数量 private Long runningContainers; // 内存时间 private Long memorySeconds; // cpu时间 private Long vcoreSeconds; // 占用的内存大小 private Long preemptedResourceMB; // 占用的cpu数量 private Long preemptedResourceVCores; private Long numNonAMContainerPreempted; private Long numAMContainerPreempted; // yarn的日志聚合状态(NOT_START、SUCCEEDED) private String logAggregationStatus; public String getId() { return id; } public void setId(String id) { this.id = id; } public String getUser() { return user; } public void setUser(String user) { this.user = user; } public String getName() { return name; } public void setName(String name) { this.name = name; } public String getQueue() { return queue; } public void setQueue(String queue) { this.queue = queue; } public String getState() { return state; } public void setState(String state) { this.state = state; } public String getFinalStatus() { return finalStatus; } public void setFinalStatus(String finalStatus) { this.finalStatus = finalStatus; } public Double getProgress() { return progress; } public void setProgress(Double progress) { this.progress = progress; } public String getTrackingUI() { return trackingUI; } public void setTrackingUI(String trackingUI) { this.trackingUI = trackingUI; } public String getTrackingUrl() { return trackingUrl; } public void setTrackingUrl(String trackingUrl) { this.trackingUrl = trackingUrl; } public String getDiagnostics() { return diagnostics; } public void setDiagnostics(String diagnostics) { this.diagnostics = diagnostics; } public Long getClusterId() { return clusterId; } public void setClusterId(Long clusterId) { this.clusterId = clusterId; } public String getApplicationType() { return applicationType; } public void setApplicationType(String applicationType) { this.applicationType = applicationType; } public String getApplicationTags() { return applicationTags; } public void setApplicationTags(String applicationTags) { this.applicationTags = applicationTags; } public Long getStartedTime() { return startedTime; } public void setStartedTime(Long startedTime) { this.startedTime = startedTime; } public Long getFinishedTime() { return finishedTime; } public void setFinishedTime(Long finishedTime) { this.finishedTime = finishedTime; } public Long getElapsedTime() { return elapsedTime; } public void setElapsedTime(Long elapsedTime) { this.elapsedTime = elapsedTime; } public String getAmContainerLogs() { return amContainerLogs; } public void setAmContainerLogs(String amContainerLogs) { this.amContainerLogs = amContainerLogs; } public String getAmHostHttpAddress() { return amHostHttpAddress; } public void setAmHostHttpAddress(String 
amHostHttpAddress) { this.amHostHttpAddress = amHostHttpAddress; } public Long getAllocatedMB() { return allocatedMB; } public void setAllocatedMB(Long allocatedMB) { this.allocatedMB = allocatedMB; } public Long getAllocatedVCores() { return allocatedVCores; } public void setAllocatedVCores(Long allocatedVCores) { this.allocatedVCores = allocatedVCores; } public Long getRunningContainers() { return runningContainers; } public void setRunningContainers(Long runningContainers) { this.runningContainers = runningContainers; } public Long getMemorySeconds() { return memorySeconds; } public void setMemorySeconds(Long memorySeconds) { this.memorySeconds = memorySeconds; } public Long getVcoreSeconds() { return vcoreSeconds; } public void setVcoreSeconds(Long vcoreSeconds) { this.vcoreSeconds = vcoreSeconds; } public Long getPreemptedResourceMB() { return preemptedResourceMB; } public void setPreemptedResourceMB(Long preemptedResourceMB) { this.preemptedResourceMB = preemptedResourceMB; } public Long getPreemptedResourceVCores() { return preemptedResourceVCores; } public void setPreemptedResourceVCores(Long preemptedResourceVCores) { this.preemptedResourceVCores = preemptedResourceVCores; } public Long getNumNonAMContainerPreempted() { return numNonAMContainerPreempted; } public void setNumNonAMContainerPreempted(Long numNonAMContainerPreempted) { this.numNonAMContainerPreempted = numNonAMContainerPreempted; } public Long getNumAMContainerPreempted() { return numAMContainerPreempted; } public void setNumAMContainerPreempted(Long numAMContainerPreempted) { this.numAMContainerPreempted = numAMContainerPreempted; } public String getLogAggregationStatus() { return logAggregationStatus; } public void setLogAggregationStatus(String logAggregationStatus) { this.logAggregationStatus = logAggregationStatus; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/runtime/ClassLoaderInfo.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.bean.runtime; import java.io.Serializable; import java.lang.management.ClassLoadingMXBean; import java.lang.management.ManagementFactory; /** * 获取运行时class loader信息 * @author ChengLong 2019年9月28日 19:56:18 */ public class ClassLoaderInfo implements Serializable { private static final long serialVersionUID = 4958598582046079565L; /** * 获取已加载的类数量 */ private long loadedClassCount; /** * 获取总的类加载数 */ private long totalLoadedClassCount; /** * 获取未被加载的类总数 */ private long unloadedClassCount; private ClassLoaderInfo() {} public long getLoadedClassCount() { return loadedClassCount; } public long getTotalLoadedClassCount() { return totalLoadedClassCount; } public long getUnloadedClassCount() { return unloadedClassCount; } /** * 获取类加载器相关信息 */ public static ClassLoaderInfo getClassLoaderInfo() { ClassLoaderInfo classLoaderInfo = new ClassLoaderInfo(); // 获取类加载器相关信息 ClassLoadingMXBean classLoadingMXBean = ManagementFactory.getClassLoadingMXBean(); classLoaderInfo.loadedClassCount = classLoadingMXBean.getLoadedClassCount(); classLoaderInfo.totalLoadedClassCount = classLoadingMXBean.getTotalLoadedClassCount(); classLoaderInfo.unloadedClassCount = classLoadingMXBean.getUnloadedClassCount(); return classLoaderInfo; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/runtime/CpuInfo.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.bean.runtime; import com.sun.management.OperatingSystemMXBean; import com.zto.fire.common.util.MathUtils; import oshi.SystemInfo; import oshi.hardware.CentralProcessor; import oshi.hardware.CentralProcessor.TickType; import oshi.hardware.HardwareAbstractionLayer; import oshi.hardware.Sensors; import oshi.util.FormatUtil; import java.io.Serializable; import java.lang.management.ManagementFactory; /** * 用于封装cpu运行时信息 * * @author ChengLong 2019-9-28 19:52:56 */ public class CpuInfo implements Serializable { private static final long serialVersionUID = 7712733535989008368L; /** * 系统cpu的负载 */ private double cpuLoad; /** * 当前jvm可用的处理器数量 */ private int availableProcessors; /** * 当前jvm占用的cpu时长 */ private long processCpuTime; /** * 当前jvm占用的cpu负载 */ private double processCpuLoad; /** * cpu温度 */ private double temperature; /** * cpu电压 */ private double voltage; /** * 风扇转速 */ private int[] fanSpeeds; /** * 物理cpu数 */ private int physicalCpu; /** * 逻辑cpu数 */ private int logicalCpu; /** * 运行时间 */ private long uptime; /** * io等待 */ private long ioWait; /** * 用户时长 */ private long userTick; /** * nice时长 */ private long niceTick; /** * 系统时长 */ private long sysTick; /** * 空闲时长 */ private long idleTick; /** * 中断时长 */ private long irqTick; /** * 软中断时长 */ private long softIrqTick; /** * cpu steal 时长 */ private long stealTick; /** * cpu平均负载 */ private double[] loadAverage; /** * 最近一次平均负载 */ private double lastLoadAverage; public double[] getLoadAverage() { return this.loadAverage; } public double getLastLoadAverage() { return lastLoadAverage; } public double getCpuLoad() { return MathUtils.doubleScale(cpuLoad, 2); } public int getAvailableProcessors() { return availableProcessors; } public long getProcessCpuTime() { return processCpuTime; } public double getProcessCpuLoad() { return MathUtils.doubleScale(processCpuLoad, 2); } public String getTemperature() { return temperature + "℃"; } public String getVoltage() { return voltage + "v"; } public int[] getFanSpeeds() { return fanSpeeds; } public int getPhysicalCpu() { return physicalCpu; } public int getLogicalCpu() { return logicalCpu; } public String getUptime() { return FormatUtil.formatElapsedSecs(uptime); } public long getIoWait() { return ioWait; } public long getUserTick() { return userTick; } public long getNiceTick() { return niceTick; } public long getSysTick() { return sysTick; } public long getIdleTick() { return idleTick; } public long getIrqTick() { return irqTick; } public long getSoftIrqTick() { return softIrqTick; } public long getStealTick() { return stealTick; } private CpuInfo() { } /** * 获取cpu使用信息 */ public static CpuInfo getCpuInfo() { CpuInfo cpuInfo = new CpuInfo(); OperatingSystemMXBean osmxb = (OperatingSystemMXBean) ManagementFactory.getOperatingSystemMXBean(); cpuInfo.lastLoadAverage = osmxb.getSystemLoadAverage(); cpuInfo.cpuLoad = osmxb.getSystemCpuLoad(); cpuInfo.availableProcessors = osmxb.getAvailableProcessors(); cpuInfo.processCpuTime = osmxb.getProcessCpuTime(); cpuInfo.processCpuLoad = osmxb.getProcessCpuLoad(); SystemInfo systemInfo = new SystemInfo(); HardwareAbstractionLayer hal = systemInfo.getHardware(); Sensors sensors = hal.getSensors(); cpuInfo.temperature = sensors.getCpuTemperature(); cpuInfo.voltage = sensors.getCpuVoltage(); cpuInfo.fanSpeeds = sensors.getFanSpeeds(); CentralProcessor centralProcessor = hal.getProcessor(); cpuInfo.physicalCpu = centralProcessor.getPhysicalProcessorCount(); cpuInfo.logicalCpu = centralProcessor.getLogicalProcessorCount(); CentralProcessor processor = 
hal.getProcessor(); cpuInfo.uptime = processor.getSystemUptime(); long[] ticks = processor.getSystemCpuLoadTicks(); long[] prevTicks = processor.getSystemCpuLoadTicks(); cpuInfo.userTick = ticks[TickType.USER.getIndex()] - prevTicks[TickType.USER.getIndex()]; cpuInfo.niceTick = ticks[TickType.NICE.getIndex()] - prevTicks[TickType.NICE.getIndex()]; cpuInfo.sysTick = ticks[TickType.SYSTEM.getIndex()] - prevTicks[TickType.SYSTEM.getIndex()]; cpuInfo.idleTick = ticks[TickType.IDLE.getIndex()] - prevTicks[TickType.IDLE.getIndex()]; cpuInfo.ioWait = ticks[TickType.IOWAIT.getIndex()] - prevTicks[TickType.IOWAIT.getIndex()]; cpuInfo.irqTick = ticks[TickType.IRQ.getIndex()] - prevTicks[TickType.IRQ.getIndex()]; cpuInfo.softIrqTick = ticks[TickType.SOFTIRQ.getIndex()] - prevTicks[TickType.SOFTIRQ.getIndex()]; cpuInfo.stealTick = ticks[TickType.STEAL.getIndex()] - prevTicks[TickType.STEAL.getIndex()]; cpuInfo.loadAverage = processor.getSystemLoadAverage(3); return cpuInfo; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/runtime/DiskInfo.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.bean.runtime; import com.google.common.collect.ImmutableMap; import com.zto.fire.common.util.MathUtils; import oshi.SystemInfo; import oshi.hardware.HWDiskStore; import oshi.hardware.HardwareAbstractionLayer; import oshi.software.os.FileSystem; import oshi.software.os.OSFileStore; import java.util.LinkedList; import java.util.List; import java.util.Map; /** * 用于封装系统磁盘信息 * * @author ChengLong 2019年9月29日 09:36:57 */ public class DiskInfo { /** * 磁盘名称 */ private String name; /** * 磁盘制造商 */ private String model; /** * 磁盘总空间 */ private long total; /** * 磁盘读取总量 */ private long reads; /** * 磁盘写入总量 */ private long writes; /** * 磁盘读/写花费的毫秒数 */ private long transferTime; /** * 磁盘分区信息 */ private static class DiskPartitionInfo { // 分区名称 private String name; // 文件系统类型 private String fileSystem; // 挂载点 private String mount; // 磁盘总空间 private long total; // 磁盘可用空间 private long free; // 磁盘已使用空间 private long used; // 磁盘已用空间的百分比 private double usedPer; // 总的inodes数 private long totalInodes; // 可用的inodes数 private long freeInodes; // 已用的inodes数 private long usedInodes; // 已用的inode百分比 private double usedInodesPer; public DiskPartitionInfo() { } public DiskPartitionInfo(String name, String fileSystem, String mount, long total, long free, long totalInodes, long freeInodes) { this.name = name; this.fileSystem = fileSystem; this.mount = mount; this.total = total; this.free = free; this.used = total - free; this.usedPer = MathUtils.percent(this.used, this.total, 2); this.totalInodes = totalInodes; this.freeInodes = freeInodes; this.usedInodes = totalInodes - freeInodes; this.usedInodesPer = MathUtils.percent(this.usedInodes, this.totalInodes, 2); } public String getName() { return name; } public String getFileSystem() { return fileSystem; } public String getMount() { return mount; } public long getTotal() { return total; } public long getFree() { return free; } public long getUsed() { return used; } public long getTotalInodes() { return totalInodes; } public long getFreeInodes() { return freeInodes; } public long getUsedInodes() { return usedInodes; } public String getUsedPer() { return usedPer + "%"; } public String getUsedInodesPer() { return usedInodesPer + "%"; } } public String getName() { return name; } public String getModel() { return model; } public long getTotal() { return total; } public long getReads() { return reads; } public long getWrites() { return writes; } public long getTransferTime() { return transferTime; } private DiskInfo() { } private DiskInfo(String name, String model, long total, long reads, long writes, long transferTime) { this.name = name; this.model = model; this.total = total; this.reads = reads; this.writes = writes; this.transferTime = transferTime; } /** * 获取磁盘与分区信息 */ public static Map getDiskInfo() { SystemInfo systemInfo = new SystemInfo(); // 获取文件系统信息 FileSystem fileSystem = systemInfo.getOperatingSystem().getFileSystem(); OSFileStore[] fileStores = fileSystem.getFileStores(); List partitionInfoList = new LinkedList<>(); for (OSFileStore fileStore : fileStores) { if (fileStore != null) { partitionInfoList.add(new DiskPartitionInfo(fileStore.getName(), fileStore.getType(), fileStore.getMount(), fileStore.getTotalSpace(), fileStore.getUsableSpace(), fileStore.getTotalInodes(), fileStore.getFreeInodes())); } } // 获取磁盘信息 HardwareAbstractionLayer hal = systemInfo.getHardware(); List diskInfoList = new LinkedList<>(); for (HWDiskStore disk : hal.getDiskStores()) { DiskInfo diskInfo = new DiskInfo(disk.getName(), disk.getModel(), disk.getSize(), 
disk.getReadBytes(), disk.getWriteBytes(), disk.getTransferTime()); diskInfoList.add(diskInfo); } return ImmutableMap.builder().put("disks", diskInfoList).put("partitions", partitionInfoList).build(); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/runtime/DisplayInfo.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.bean.runtime; import oshi.SystemInfo; import oshi.hardware.Display; /** * 用于封装显示器相关信息 * @author ChengLong 2019年9月30日 13:36:16 */ public class DisplayInfo { /** * 显示器描述信息 */ private String display; public String getDisplay() { return display; } private DisplayInfo() { } /** * 获取显示器信息 */ public static DisplayInfo getDisplayInfo() { SystemInfo systemInfo = new SystemInfo(); Display[] displays = systemInfo.getHardware().getDisplays(); StringBuilder sb = new StringBuilder(); if (displays != null && displays.length > 0) { for (Display display : displays) { sb.append(display); } } DisplayInfo displayInfo = new DisplayInfo(); displayInfo.display = sb.toString(); return displayInfo; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/runtime/HardwareInfo.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.bean.runtime; import com.zto.fire.common.util.MathUtils; import oshi.SystemInfo; import oshi.hardware.ComputerSystem; import oshi.hardware.HardwareAbstractionLayer; import oshi.hardware.PowerSource; /** * 硬件信息封装类 * * @author ChengLong 2019年9月29日 15:52:50 */ public class HardwareInfo { private static HardwareInfo hardwareInfo = new HardwareInfo(); /** * 制造商 */ private String manufacturer; /** * 型号 */ private String model; /** * 序列号 */ private String serialNumber; /** * 电源信息 */ private String power; /** * 电池容量 */ private String batteryCapacity; public String getManufacturer() { return manufacturer; } public String getModel() { return model; } public String getSerialNumber() { return serialNumber; } public String getPower() { return power; } public String getBatteryCapacity() { return batteryCapacity; } private HardwareInfo() { } /** * 获取硬件设备信息 */ public static HardwareInfo getHardwareInfo() { SystemInfo systemInfo = new SystemInfo(); HardwareAbstractionLayer hardware = systemInfo.getHardware(); ComputerSystem computerSystem = hardware.getComputerSystem(); if (hardwareInfo.manufacturer == null) { hardwareInfo.manufacturer = computerSystem.getManufacturer(); } if (hardwareInfo.model == null) { hardwareInfo.model = computerSystem.getModel(); } if (hardwareInfo.serialNumber == null) { hardwareInfo.serialNumber = computerSystem.getSerialNumber().trim(); } // 获取电源信息 PowerSource[] powerSources = hardware.getPowerSources(); if (powerSources == null || powerSources.length == 0) { hardwareInfo.power = "Unknown"; } else { double timeRemaining = powerSources[0].getTimeRemaining(); if (timeRemaining < -1d) { hardwareInfo.power = "充电中"; } else if (timeRemaining < 0d) { hardwareInfo.power = "计算剩余时间"; } else { hardwareInfo.power = String.format("%d:%02d remaining", (int) (timeRemaining / 3600), (int) (timeRemaining / 60) % 60); } for (PowerSource pSource : powerSources) { hardwareInfo.batteryCapacity = MathUtils.doubleScale(pSource.getRemainingCapacity() * 100d, 2) + ""; } } return hardwareInfo; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/runtime/JvmInfo.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.bean.runtime; import java.io.Serializable; import java.lang.management.*; import java.util.List; /** * Jvm信息包装类,可获取jvm相关信息 * @author ChengLong 2019-9-28 19:38:36 */ public class JvmInfo implements Serializable { private static final long serialVersionUID = 3857878519712626828L; /** * Java版本 */ private String javaVersion; private String javaHome; private String classVersion; /** * jvm可从操作系统申请的最大内存 */ private long memoryMax; /** * jvm已使用操作系统的总内存空间 */ private long memoryTotal; /** * jvm剩余内存空间 */ private long memoryFree; /** * jvm已使用内存空间 */ private long memoryUsed; /** * jvm启动时间,unix时间戳 */ private long startTime; /** * jvm运行时间 */ private long uptime; /** * jvm heap 初始内存大小 */ private long heapInitSize; /** * jvm heap 最大内存空间 */ private long heapMaxSize; /** * jvm heap 已使用空间大小 */ private long heapUseSize; /** * jvm heap 已提交的空间大小 */ private long heapCommitedSize; /** * jvm Non-Heap初始空间 */ private long nonHeapInitSize; /** * jvm Non-Heap最大空间 */ private long nonHeapMaxSize; /** * jvm Non-Heap已使用空间 */ private long nonHeapUseSize; /** * jvm Non-Heap已提交空间 */ private long nonHeapCommittedSize; /** * minor gc 次数 */ private long minorGCCount; /** * minor gc 总耗时 */ private long minorGCTime; /** * full gc 次数 */ private long fullGCCount; /** * full gc 总耗时 */ private long fullGCTime; /** * 虚拟机参数 */ private List jvmOptions; private JvmInfo() {} public long getMemoryMax() { return memoryMax; } public long getMemoryTotal() { return memoryTotal; } public long getMemoryFree() { return memoryFree; } public long getMemoryUsed() { return memoryUsed; } public long getStartTime() { return startTime; } public long getUptime() { return uptime; } public long getHeapInitSize() { return heapInitSize; } public long getHeapMaxSize() { return heapMaxSize; } public long getHeapUseSize() { return heapUseSize; } public long getHeapCommitedSize() { return heapCommitedSize; } public long getNonHeapInitSize() { return nonHeapInitSize; } public long getNonHeapMaxSize() { return nonHeapMaxSize; } public long getNonHeapUseSize() { return nonHeapUseSize; } public long getNonHeapCommittedSize() { return nonHeapCommittedSize; } public String getJavaVersion() { return javaVersion; } public String getJavaHome() { return javaHome; } public String getClassVersion() { return classVersion; } public long getMinorGCCount() { return minorGCCount; } public long getMinorGCTime() { return minorGCTime; } public long getFullGCCount() { return fullGCCount; } public long getFullGCTime() { return fullGCTime; } public List getJvmOptions() { return jvmOptions; } /** * 获取Jvm、类加载器与线程相关信息 */ public static JvmInfo getJvmInfo() { Runtime runtime = Runtime.getRuntime(); JvmInfo jvmInfo = new JvmInfo(); jvmInfo.memoryMax = runtime.maxMemory(); jvmInfo.memoryTotal = runtime.totalMemory(); jvmInfo.memoryFree = runtime.freeMemory(); jvmInfo.memoryUsed = jvmInfo.memoryTotal - jvmInfo.memoryFree; RuntimeMXBean runtimeMXBean = ManagementFactory.getRuntimeMXBean(); jvmInfo.startTime = runtimeMXBean.getStartTime(); jvmInfo.uptime = runtimeMXBean.getUptime(); // 获取jvm heap相关信息 MemoryMXBean memoryMBean = ManagementFactory.getMemoryMXBean(); MemoryUsage heapUsage = memoryMBean.getHeapMemoryUsage(); jvmInfo.heapInitSize = heapUsage.getInit(); jvmInfo.heapMaxSize = heapUsage.getMax(); jvmInfo.heapUseSize = heapUsage.getUsed(); jvmInfo.heapCommitedSize = heapUsage.getCommitted(); // 获取jvm non-heap相关信息 MemoryUsage nonHeapUsage = memoryMBean.getNonHeapMemoryUsage(); jvmInfo.nonHeapInitSize = nonHeapUsage.getInit(); jvmInfo.nonHeapMaxSize = 
nonHeapUsage.getMax(); jvmInfo.nonHeapUseSize = nonHeapUsage.getUsed(); jvmInfo.nonHeapCommittedSize = nonHeapUsage.getCommitted(); // 获取jvm版本与安装信息 jvmInfo.javaVersion = System.getProperty("java.version"); jvmInfo.javaHome = System.getProperty("java.home"); jvmInfo.classVersion = System.getProperty("java.class.version"); // jvm 参数 jvmInfo.jvmOptions = ManagementFactory.getRuntimeMXBean().getInputArguments(); // 获取gc信息 List gcs = ManagementFactory.getGarbageCollectorMXBeans(); for (GarbageCollectorMXBean gc : gcs) { if (gc.getName().contains("Young") || gc.getName().contains("MarkSweep")) { jvmInfo.minorGCCount = gc.getCollectionCount(); jvmInfo.minorGCTime = gc.getCollectionTime(); } if (gc.getName().contains("Old") || gc.getName().contains("Scavenge")) { jvmInfo.fullGCCount = gc.getCollectionCount(); jvmInfo.fullGCTime = gc.getCollectionTime(); } } return jvmInfo; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/runtime/MemoryInfo.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.bean.runtime; import com.sun.management.OperatingSystemMXBean; import java.io.Serializable; import java.lang.management.ManagementFactory; /** * 用于封装当前系统内存信息 * @author ChengLong 2019-9-28 19:50:22 */ public class MemoryInfo implements Serializable { private static final long serialVersionUID = 7803435486311085016L; /** * 操作系统总内存空间 */ private long total; /** * 操作系统内存剩余空间 */ private long free; /** * 操作系统内存使用空间 */ private long used; /** * 操作系统提交的虚拟内存大小 */ private long commitVirtual; /** * 操作系统交换内存总空间 */ private long swapTotal; /** * 操作系统交换内存剩余空间 */ private long swapFree; /** * 操作系统交换内存已使用空间 */ private long swapUsed; private MemoryInfo() {} public long getTotal() { return total; } public long getFree() { return free; } public long getUsed() { return used; } public long getCommitVirtual() { return commitVirtual; } public long getSwapTotal() { return swapTotal; } public long getSwapFree() { return swapFree; } public long getSwapUsed() { return swapUsed; } /** * 获取内存使用信息 */ public static MemoryInfo getMemoryInfo() { MemoryInfo memoryInfo = new MemoryInfo(); OperatingSystemMXBean osmxb = (OperatingSystemMXBean) ManagementFactory.getOperatingSystemMXBean(); memoryInfo.total = osmxb.getTotalPhysicalMemorySize(); memoryInfo.free = osmxb.getFreePhysicalMemorySize(); memoryInfo.used = memoryInfo.total - memoryInfo.free; memoryInfo.swapTotal = osmxb.getTotalSwapSpaceSize(); memoryInfo.swapFree = osmxb.getFreeSwapSpaceSize(); memoryInfo.swapUsed = memoryInfo.swapTotal - memoryInfo.swapFree; memoryInfo.commitVirtual = osmxb.getCommittedVirtualMemorySize(); return memoryInfo; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/runtime/NetworkInfo.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.bean.runtime; import com.zto.fire.common.util.OSUtils; import oshi.SystemInfo; import oshi.hardware.HardwareAbstractionLayer; import oshi.hardware.NetworkIF; import oshi.software.os.NetworkParams; import java.util.LinkedList; import java.util.List; /** * 网卡信息封装类 * @author ChengLong 2019年9月30日 10:39:08 */ public class NetworkInfo { /** * 网卡名称 */ private String name; /** * 网卡display名称 */ private String displayName; /** * mac地址 */ private String macAddress; /** * 最大传输单元 */ private int mtu; /** * 网卡带宽 */ private long speed; /** * ip v4 地址 */ private String[] ipv4; /** * ip v6 地址 */ private String[] ipv6; /** * ip 地址 */ private String ip; /** * 接收到的数据报个数 */ private long packetsRecv; /** * 发送的数据报个数 */ private long packetsSent; /** * 接收到的数据大小 */ private long bytesRecv; /** * 发送的数据大小 */ private long bytesSent; /** * 主机名 */ private String hostname; /** * 域名称 */ private String domainName; /** * dns */ private String[] dns; /** * ip v4 网关 */ private String ipv4Gateway; /** * ip v6 网关 */ private String ipv6Gateway; public String getName() { return name; } public String getDisplayName() { return displayName; } public String getMacAddress() { return macAddress; } public int getMtu() { return mtu; } public long getSpeed() { return speed; } public String[] getIpv4() { return ipv4; } public String[] getIpv6() { return ipv6; } public String getIp() { return ip; } public long getPacketsRecv() { return packetsRecv; } public long getPacketsSent() { return packetsSent; } public long getBytesRecv() { return bytesRecv; } public long getBytesSent() { return bytesSent; } public String getHostname() { return hostname; } public String getDomainName() { return domainName; } public String[] getDns() { return dns; } public String getIpv4Gateway() { return ipv4Gateway; } public String getIpv6Gateway() { return ipv6Gateway; } private NetworkInfo() {} public static List getNetworkInfo() { SystemInfo systemInfo = new SystemInfo(); HardwareAbstractionLayer hal = systemInfo.getHardware(); NetworkIF[] networkIFS = hal.getNetworkIFs(); List networkInfoList = new LinkedList<>(); if (networkIFS != null && networkIFS.length > 0) { NetworkParams networkParams = systemInfo.getOperatingSystem().getNetworkParams(); for (NetworkIF networkIF : networkIFS) { NetworkInfo networkInfo = new NetworkInfo(); networkInfo.name = networkIF.getName(); networkInfo.displayName = networkIF.getDisplayName(); networkInfo.bytesRecv = networkIF.getBytesRecv(); networkInfo.bytesSent = networkIF.getBytesSent(); networkInfo.packetsRecv = networkIF.getPacketsRecv(); networkInfo.packetsSent = networkIF.getPacketsSent(); networkInfo.ip = OSUtils.getIp(); networkInfo.ipv4 = networkIF.getIPv4addr(); networkInfo.ipv6 = networkIF.getIPv6addr(); networkInfo.mtu = networkIF.getMTU(); networkInfo.speed = networkIF.getSpeed(); networkInfo.macAddress = networkIF.getMacaddr(); networkInfo.hostname = networkParams.getHostName(); networkInfo.domainName = networkParams.getDomainName(); networkInfo.ipv4Gateway = networkParams.getIpv4DefaultGateway(); networkInfo.ipv6Gateway = networkParams.getIpv6DefaultGateway(); networkInfo.dns = networkParams.getDnsServers(); networkInfoList.add(networkInfo); } } return networkInfoList; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/runtime/OSInfo.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.bean.runtime; import com.zto.fire.common.util.OSUtils; import oshi.SystemInfo; import oshi.software.os.OperatingSystem; import oshi.util.FormatUtil; /** * 用于封装操作系统信息 * * @author ChengLong 2019-9-28 19:56:59 */ public class OSInfo { private static OSInfo osInfo = new OSInfo(); /** * 制造商 */ private String manufacturer; /** * 操作系统名称 */ private String name; /** * 操作系统架构 */ private String arch; /** * 操作系统版本 */ private String version; /** * 当前用户 */ private String userName; /** * 当前用户家目录 */ private String userHome; /** * 当前用户工作目录 */ private String userDir; /** * 机器的ip */ private String ip; /** * 集群的主机名 */ private String hostname; /** * 运行时间 */ private String uptime; /** * 组织信息 */ private String family; private OSInfo() { } public String getName() { return name; } public String getArch() { return arch; } public String getVersion() { return version; } public String getUserName() { return userName; } public String getUserHome() { return userHome; } public String getUserDir() { return userDir; } public String getIp() { return ip; } public String getHostname() { return hostname; } public String getManufacturer() { return manufacturer; } public String getUptime() { return uptime; } public String getFamily() { return family; } /** * 获取操作系统相关信息 */ public static OSInfo getOSInfo() { SystemInfo systemInfo = new SystemInfo(); osInfo.name = System.getProperty("os.name"); osInfo.arch = System.getProperty("os.arch"); osInfo.version = System.getProperty("os.version"); osInfo.userName = System.getProperty("user.name"); osInfo.userHome = System.getProperty("user.home"); osInfo.userDir = System.getProperty("user.dir"); osInfo.ip = OSUtils.getIp(); osInfo.hostname = OSUtils.getHostName(); OperatingSystem os = systemInfo.getOperatingSystem(); osInfo.manufacturer = os.getManufacturer(); osInfo.family = os.getFamily(); osInfo.uptime = FormatUtil.formatElapsedSecs(systemInfo.getHardware().getProcessor().getSystemUptime()); return osInfo; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/runtime/RuntimeInfo.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */
package com.zto.fire.common.bean.runtime;

import com.zto.fire.common.util.OSUtils;
import org.apache.commons.lang3.StringUtils;

import java.io.Serializable;

/**
 * Collects runtime information such as JVM, OS and memory metrics
 *
 * @author ChengLong 2019-09-28 16:57:03
 */
public class RuntimeInfo implements Serializable {
    private static final long serialVersionUID = 1960438466835847330L;
    private static RuntimeInfo runtimeInfo = new RuntimeInfo();

    /**
     * JVM runtime information
     */
    private JvmInfo jvmInfo;

    /**
     * Thread runtime information
     */
    private ThreadInfo threadInfo;

    /**
     * CPU runtime information
     */
    private CpuInfo cpuInfo;

    /**
     * Memory runtime information
     */
    private MemoryInfo memoryInfo;

    /**
     * Class loader runtime information
     */
    private ClassLoaderInfo classLoaderInfo;

    /**
     * IP of the host the executor runs on
     */
    private static String ip;

    /**
     * Hostname of the executor
     */
    private static String hostname;

    /**
     * PID of the current process
     */
    private static String pid;

    /**
     * Executor start time (unix timestamp)
     */
    private long startTime = System.currentTimeMillis();

    private RuntimeInfo() {
    }

    public JvmInfo getJvmInfo() { return jvmInfo; }

    public ThreadInfo getThreadInfo() { return threadInfo; }

    public CpuInfo getCpuInfo() { return cpuInfo; }

    public MemoryInfo getMemoryInfo() { return memoryInfo; }

    public ClassLoaderInfo getClassLoaderInfo() { return classLoaderInfo; }

    public String getIp() { return ip; }

    public String getHostname() { return hostname; }

    public String getPid() { return pid; }

    public long getStartTime() { return startTime; }

    public long getUptime() {
        // executor uptime in milliseconds
        return System.currentTimeMillis() - this.startTime;
    }

    /**
     * Collects the current runtime information
     *
     * @return the current runtime information
     */
    public static RuntimeInfo getRuntimeInfo() {
        if (StringUtils.isBlank(ip)) { ip = OSUtils.getIp(); }
        if (StringUtils.isBlank(hostname)) { hostname = OSUtils.getHostName(); }
        if (StringUtils.isBlank(pid)) { pid = OSUtils.getPid(); }
        runtimeInfo.jvmInfo = JvmInfo.getJvmInfo();
        runtimeInfo.classLoaderInfo = ClassLoaderInfo.getClassLoaderInfo();
        runtimeInfo.threadInfo = ThreadInfo.getThreadInfo();
        runtimeInfo.cpuInfo = CpuInfo.getCpuInfo();
        runtimeInfo.memoryInfo = MemoryInfo.getMemoryInfo();
        return runtimeInfo;
    }
}
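// A minimal sketch of reading the aggregated runtime metrics (illustrative only): RuntimeInfo
// pulls together the JVM, thread, CPU, memory and class-loader beans defined in this package,
// and JSONUtils (already used by this module) can serialize the snapshot for reporting.
class RuntimeInfoSketch {
    public static void main(String[] args) {
        RuntimeInfo info = RuntimeInfo.getRuntimeInfo();
        System.out.println("heap used: " + info.getJvmInfo().getHeapUseSize());
        System.out.println("threads:   " + info.getThreadInfo().getTotalCount());
        System.out.println(com.zto.fire.common.util.JSONUtils.toJSONString(info));
    }
}

================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/runtime/ThreadInfo.java ================================================

/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.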
*/ package com.zto.fire.common.bean.runtime; import com.sun.management.ThreadMXBean; import java.io.Serializable; import java.lang.management.ManagementFactory; /** * 用于包装运行时线程信息 * @author ChengLong 2019-9-28 19:36:52 */ public class ThreadInfo implements Serializable { private static final long serialVersionUID = 7950498675819426939L; /** * 当前线程的总 CPU 时间(以毫微秒为单位) */ private long cpuTime; /** * 当前线程的总用户cpu时间(以毫微秒为单位) */ private long userTime; /** * 当前守护线程的总数 */ private int deamonCount; /** * 返回自从 Java 虚拟机启动或峰值重置以来峰值活动线程计数 */ private int peakCount; /** * 返回当前线程的总数,包括守护线程和非守护线程 */ private int totalCount; /** * 返回自从 Java 虚拟机启动以来创建和启动的线程总数目 */ private long totalStartedCount; private ThreadInfo() {} public long getCpuTime() { return cpuTime; } public long getUserTime() { return userTime; } public int getDeamonCount() { return deamonCount; } public int getPeakCount() { return peakCount; } public int getTotalCount() { return totalCount; } public long getTotalStartedCount() { return totalStartedCount; } /** * 获取线程相关信息 */ public static ThreadInfo getThreadInfo() { ThreadInfo threadInfo = new ThreadInfo(); ThreadMXBean threadMBean = (ThreadMXBean) ManagementFactory.getThreadMXBean(); threadInfo.cpuTime = threadMBean.getCurrentThreadCpuTime(); threadInfo.userTime = threadMBean.getCurrentThreadUserTime(); threadInfo.deamonCount = threadMBean.getDaemonThreadCount(); threadInfo.peakCount = threadMBean.getPeakThreadCount(); threadInfo.totalCount = threadMBean.getThreadCount(); threadInfo.totalStartedCount = threadMBean.getTotalStartedThreadCount(); return threadInfo; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/bean/runtime/UsbInfo.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
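A quick sketch of reading the thread snapshot. Because the implementation casts to com.sun.management.ThreadMXBean, the example assumes a HotSpot-compatible JVM; the ThreadInfoExample class name is illustrative.

import com.zto.fire.common.bean.runtime.ThreadInfo;

public class ThreadInfoExample {
    public static void main(String[] args) {
        ThreadInfo t = ThreadInfo.getThreadInfo();
        System.out.println("live=" + t.getTotalCount()
                + ", daemon=" + t.getDeamonCount()
                + ", peak=" + t.getPeakCount()
                + ", started=" + t.getTotalStartedCount());
        // CPU times are reported in nanoseconds for the calling thread
        System.out.println("cpuTime(ns)=" + t.getCpuTime() + ", userTime(ns)=" + t.getUserTime());
    }
}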
*/ package com.zto.fire.common.bean.runtime; import oshi.SystemInfo; import oshi.hardware.UsbDevice; import java.util.LinkedList; import java.util.List; /** * 用于封装usb设备信息 * @author ChengLong 2019年9月30日 13:33:35 */ public class UsbInfo { /** * usb 设备名称 */ private String name; /** * usb设备id */ private String productId; /** * usb设备制造商 */ private String vendor; /** * usb设备制造商id */ private String vendorId; /** * usb设备序列号 */ private String serialNumber; public String getName() { return name; } public String getProductId() { return productId; } public String getVendor() { return vendor; } public String getVendorId() { return vendorId; } public String getSerialNumber() { return serialNumber; } private UsbInfo() {} public UsbInfo(String name, String productId, String vendor, String vendorId, String serialNumber) { this.name = name; this.productId = productId; this.vendor = vendor; this.vendorId = vendorId; this.serialNumber = serialNumber; } /** * 获取usb社保信息 */ public static List getUsbInfo() { SystemInfo systemInfo = new SystemInfo(); UsbDevice[] usbDevices = systemInfo.getHardware().getUsbDevices(true); List usbInfoList = new LinkedList<>(); if (usbDevices != null && usbDevices.length > 0) { for (UsbDevice usbDevice : usbDevices) { UsbInfo usbInfo = new UsbInfo(usbDevice.getName(), usbDevice.getProductId(), usbDevice.getVendor(), usbDevice.getVendorId(), usbDevice.getSerialNumber()); usbInfoList.add(usbInfo); } } return usbInfoList; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/enu/ConfigureLevel.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.enu; /** * 用于定义配置的级别 * * @author ChengLong 2021-8-23 16:29:29 * @since 2.2.0 */ public enum ConfigureLevel { FRAMEWORK(10), // 框架级别配置,通用的配置信息 TASK(20), // 任务级别配置,每个任务单独的配置 URGENT(30); // 紧急配置,优先级高于用户级别配置 ConfigureLevel(int level) { } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/enu/Datasource.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
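A usage sketch for UsbInfo. The flattened dump shows getUsbInfo() returning a raw List; the example assumes it carries UsbInfo elements, and that the oshi dependency is available at runtime.

import com.zto.fire.common.bean.runtime.UsbInfo;
import java.util.List;

public class UsbInfoExample {
    public static void main(String[] args) {
        // getUsbInfo() walks the oshi USB device tree and wraps each device in a UsbInfo bean
        List<UsbInfo> devices = UsbInfo.getUsbInfo();
        for (UsbInfo usb : devices) {
            System.out.println(usb.getVendor() + " " + usb.getName()
                    + " (product=" + usb.getProductId() + ", serial=" + usb.getSerialNumber() + ")");
        }
    }
}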
* See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.enu; import org.apache.commons.lang3.StringUtils; /** * 数据源类型 * * @author ChengLong * @create 2020-07-07 16:36 * @since 2.0.0 */ public enum Datasource { HIVE(1), HBASE(2), KAFKA(3), ROCKETMQ(4), REDIS(5), ES(6), MYSQL(7), TIDB(8), ORACLE(9), SQLSERVER(10), DB2(11), CLICKHOUSE(12), PRESTO(13), KYLIN(14), DERBY(15), VIEW(16), JDBC(17), FIRE_ROCKETMQ(18), UNKNOWN(404); Datasource(int type) { } /** * 将字符串解析成指定的枚举类型 */ public static Datasource parse(String dataSource) { if (StringUtils.isBlank(dataSource)) return UNKNOWN; try { String trimDatasource = dataSource.replace("-", "_"); return Enum.valueOf(Datasource.class, trimDatasource.trim().toUpperCase()); } catch (Exception e) { return UNKNOWN; } } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/enu/ErrorCode.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.enu; /** * 系统预定义错误码 * @author ChengLong 2018年6月12日 13:39:50 */ public enum ErrorCode { SUCCESS, ERROR, PARAM_ILLEGAL, NOT_FOUND, IS_EXISTS, NOT_LOGIN, TIME_OUT, GONE, UNAUTHORIZED } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/enu/JdbcDriver.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
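Datasource.parse normalizes hyphens to underscores, trims and upper-cases the input, and never throws. A few illustrative calls, with the expected results noted in comments:

import com.zto.fire.common.enu.Datasource;

public class DatasourceParseExample {
    public static void main(String[] args) {
        System.out.println(Datasource.parse("fire-rocketmq")); // FIRE_ROCKETMQ (hyphen becomes underscore)
        System.out.println(Datasource.parse(" hbase "));       // HBASE (input is trimmed and upper-cased)
        System.out.println(Datasource.parse("not-a-source"));  // UNKNOWN (unmatched names fall back, no exception)
        System.out.println(Datasource.parse(null));            // UNKNOWN (blank input short-circuits)
    }
}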
*/ package com.zto.fire.common.enu; /** * 用于枚举常见数据库的jdbc驱动类 * * @author ChengLong 2022-04-26 14:58:17 */ public enum JdbcDriver { mysql("com.mysql.jdbc.Driver"), tidb("com.mysql.jdbc.Driver"), sqlserver("com.microsoft.sqlserver.jdbc.SQLServerDriver"), oracle("oracle.jdbc.driver.OracleDriver"), hive("org.apache.hive.jdbc.HiveDriver"), presto("com.facebook.presto.jdbc.PrestoDriver"), spark("org.apache.hive.jdbc.HiveDriver"), clickhouse("ru.yandex.clickhouse.ClickHouseDriver"), postgreSql("org.postgresql.Driver"), impala("com.cloudera.impala.jdbc41.Driver"), automatic(""); private String driver; JdbcDriver(String driver) { this.driver = driver; } public String getDriver() { return driver; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/enu/JobType.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.enu; /** * Fire任务类型 * * @author ChengLong 2019-7-26 11:06:38 */ public enum JobType { SPARK_CORE("spark_core"), SPARK_STREAMING("spark_streaming"), SPARK_STRUCTURED_STREAMING("spark_structured_streaming"), SPARK_SQL("spark_sql"), FLINK_STREAMING("flink_streaming"), FLINK_BATCH("flink_batch"), UNDEFINED("undefined"); /** * 任务类型 */ private String jobTypeDesc; JobType(String jobType) { this.jobTypeDesc = jobType; } /** * 获取当前任务的类型 * * @return */ public String getJobTypeDesc() { return this.jobTypeDesc; } /** * 用于判断当前任务是否为spark任务 * * @return true: spark任务 false:非spark任务 */ public boolean isSpark() { return this.jobTypeDesc.contains("spark"); } /** * 用于判断当前任务是否为flink任务 * * @return true: flink任务 false:非flink任务 */ public boolean isFlink() { return this.jobTypeDesc.contains("flink"); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/enu/Operation.java ================================================ package com.zto.fire.common.enu; import org.apache.commons.lang3.StringUtils; /** * SQL的操作类型 * * @author ChengLong 2021年6月17日13:12:07 * @since 2.0.0 */ public enum Operation { SELECT(1), DROP_TABLE(2), RENAME_TABLE_OLD(3), RENAME_TABLE_NEW(4), CREATE_TABLE(5), CREATE_TABLE_AS_SELECT(6), CREATE_VIEW(7), REPLACE_TABLE(8), REPLACE_TABLE_AS_SELECT(9), RENAME_PARTITION_OLD(10), RENAME_PARTITION_NEW(11), DROP_PARTITION(12), TRUNCATE(13), CACHE(14), UNCACHE(15), REFRESH(16), CREATE_DATABASE(17), DROP_DATABASE(18), ADD_PARTITION(19), ALTER_TABLE(20), INSERT_INTO(21), INSERT_OVERWRITE(22), INSERT(23), SOURCE(24), SINK(25), GET(26), SCAN(27), ENABLE_TABLE(28), DISABLE_TABLE(29), DELETE(30), DELETE_FAMILY(31), DELETE_QUALIFIER(32), UPDATE(33), UNKNOWN(404); Operation(int type) { } /** * 将字符串解析成指定的枚举类型 */ public static Operation parse(String operation) { if (StringUtils.isBlank(operation)) return UNKNOWN; try { return 
Enum.valueOf(Operation.class, operation.trim().toUpperCase()); } catch (Exception e) { return UNKNOWN; } } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/enu/RequestMethod.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.enu /** * 定义http请求的方式枚举 * * @author ChengLong 2019-3-16 10:27:11 */ object RequestMethod extends Enumeration { type RequestMethod = Value val GET = Value("get") val POST = Value("post") val DELETE = Value("delete") val PUT = Value("put") } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/enu/ThreadPoolType.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.enu; /** * 线程池类型 * @author ChengLong 2019年10月18日 14:33:52 */ public enum ThreadPoolType { FIXED, SINGLE, CACHED, SCHEDULED, WORK_STEALING } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/enu/YarnState.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
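The enums above are plain lookup and classification helpers. A short sketch of typical calls, with expected output in comments (EnumUsageExample is a made-up class name):

import com.zto.fire.common.enu.JdbcDriver;
import com.zto.fire.common.enu.JobType;
import com.zto.fire.common.enu.Operation;

public class EnumUsageExample {
    public static void main(String[] args) {
        // JdbcDriver maps a datasource keyword to its JDBC driver class name
        System.out.println(JdbcDriver.mysql.getDriver());      // com.mysql.jdbc.Driver
        System.out.println(JdbcDriver.clickhouse.getDriver()); // ru.yandex.clickhouse.ClickHouseDriver

        // JobType exposes engine checks based on its description string
        JobType type = JobType.SPARK_STREAMING;
        System.out.println(type.isSpark() + " / " + type.isFlink()); // true / false

        // Operation.parse is case-insensitive and falls back to UNKNOWN
        System.out.println(Operation.parse("insert_into")); // INSERT_INTO
        System.out.println(Operation.parse("merge"));       // UNKNOWN
    }
}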
*/ package com.zto.fire.common.enu; import org.apache.commons.lang3.StringUtils; /** * yarn的job状态 * * @author ChengLong 2019-5-16 09:19:56 */ public enum YarnState { RUNNING("running"), ACCEPTED("accepted"), SUBMITTED("submitted"), FINISHED("finished"), FAILED("failed"), KILLED("killed"), UNDEFINED("undefined"), NULL(""), UNKONOW("unknow"); // 状态信息 private final String state; YarnState(String state) { this.state = state; } public String getState() { return state; } /** * 根据状态字符串返回状态枚举 * * @param state 状态 * @return */ public static YarnState getState(String state) { if (StringUtils.isBlank(state)) { return NULL; } switch (state.toLowerCase()) { case "running": return RUNNING; case "accepted": return ACCEPTED; case "submitted": return SUBMITTED; case "finished": return FINISHED; case "failed": return FAILED; case "killed": return KILLED; default: return NULL; } } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/exception/FireException.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.exception; /** * Fire框架异常对象 * * @author ChengLong 2022-08-01 09:37:02 * @since 2.3.2 */ public class FireException extends Exception { public FireException() { super(); } public FireException(String message) { super(message); } public FireException(String message, Throwable cause) { super(message, cause); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/exception/FireFlinkException.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
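YarnState.getState maps a raw YARN state string onto the enum, falling back to NULL for blank or unmapped values. A brief sketch:

import com.zto.fire.common.enu.YarnState;

public class YarnStateExample {
    public static void main(String[] args) {
        // Matching is done on the lower-cased input, so it is case-insensitive
        System.out.println(YarnState.getState("RUNNING")); // RUNNING
        System.out.println(YarnState.getState("killed"));  // KILLED
        System.out.println(YarnState.getState(""));        // NULL (blank input)
        System.out.println(YarnState.getState("NEW"));     // NULL (unmapped states also fall back to NULL)
    }
}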
*/ package com.zto.fire.common.exception; /** * Fire-flink异常对象 * * @author ChengLong 2022-08-01 09:37:02 * @since 2.3.2 */ public class FireFlinkException extends FireException { public FireFlinkException() { super(); } public FireFlinkException(String message) { super(message); } public FireFlinkException(String message, Throwable cause) { super(message, cause); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/exception/FireSparkException.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.exception; /** * Fire-spark异常对象 * * @author ChengLong 2022-08-01 09:37:02 * @since 2.3.2 */ public class FireSparkException extends FireException { public FireSparkException() { super(); } public FireSparkException(String message) { super(message); } public FireSparkException(String message, Throwable cause) { super(message, cause); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/util/EncryptUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
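FireFlinkException and FireSparkException are thin engine-specific subclasses of the checked FireException, so callers can catch the base type. A minimal sketch, with an arbitrary root cause:

import com.zto.fire.common.exception.FireException;
import com.zto.fire.common.exception.FireSparkException;

public class FireExceptionExample {
    public static void main(String[] args) {
        try {
            throw new FireSparkException("failed to start the spark job", new IllegalStateException("root cause"));
        } catch (FireException e) {
            // catching the base type also covers FireFlinkException
            System.out.println(e.getMessage() + " <- " + e.getCause().getMessage());
        }
    }
}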
*/ package com.zto.fire.common.util; import com.zto.fire.common.conf.FireFrameworkConf; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import sun.misc.BASE64Decoder; import sun.misc.BASE64Encoder; import java.math.BigInteger; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.Objects; /** * 各种常用算法加密工具类 * * @author ChengLong 2018年7月16日 09:53:59 */ public class EncryptUtils { private static final String ERROR_MESSAGE = "参数不合法"; private static final Logger logger = LoggerFactory.getLogger(EncryptUtils.class); private EncryptUtils() {} /** * BASE64解密 */ public static String base64Decrypt(String message) { Objects.requireNonNull(message, ERROR_MESSAGE); try { return new String((new BASE64Decoder()).decodeBuffer(message), StandardCharsets.UTF_8); } catch (Exception e) { logger.error("BASE64解密出错", e); } return ""; } /** * BASE64加密 */ public static String base64Encrypt(String message) { Objects.requireNonNull(message, ERROR_MESSAGE); try { return new BASE64Encoder().encodeBuffer(message.getBytes()); } catch (Exception e) { logger.error("BASE64加密出错", e); } return ""; } /** * 生成32位md5码 */ public static String md5Encrypt(String message) { Objects.requireNonNull(message, ERROR_MESSAGE); try { // 得到一个信息摘要器 MessageDigest digest = MessageDigest.getInstance("md5"); byte[] result = digest.digest(message.getBytes(StandardCharsets.UTF_8)); StringBuilder buffer = new StringBuilder(); for (byte b : result) { int number = b & 0xff;// 加盐 String str = Integer.toHexString(number); if (str.length() == 1) { buffer.append('0'); } buffer.append(str); } // 标准的md5加密后的结果 return buffer.toString(); } catch (NoSuchAlgorithmException e) { logger.error("生成32位md5码出错", e); } return ""; } /** * SHA加密 */ public static String shaEncrypt(String message, String key) { Objects.requireNonNull(message, ERROR_MESSAGE); if(StringUtils.isBlank(key)) { key = "SHA"; } try { MessageDigest sha = MessageDigest.getInstance(key); sha.update(message.getBytes(StandardCharsets.UTF_8)); return new BigInteger(sha.digest()).toString(32); } catch (Exception e) { logger.error("生成SHA加密出错", e); } return ""; } /** * header权限校验 * @param auth * 请求json * @return * true:身份合法 false:身份非法 */ public static boolean checkAuth(String auth, String privateKey) { if (StringUtils.isBlank(auth)) { return false; } String fireAuth = EncryptUtils.md5Encrypt(FireFrameworkConf.restServerSecret() + privateKey); return fireAuth.equals(auth); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/util/FileUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
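A usage sketch for EncryptUtils. Because it relies on sun.misc.BASE64Encoder/BASE64Decoder, the example assumes a Java 8 runtime; the sample inputs are arbitrary.

import com.zto.fire.common.util.EncryptUtils;

public class EncryptUtilsExample {
    public static void main(String[] args) {
        String encoded = EncryptUtils.base64Encrypt("hello fire");
        // encodeBuffer may append a trailing newline, so trim it for display
        System.out.println(encoded.trim());
        System.out.println(EncryptUtils.base64Decrypt(encoded)); // hello fire

        // 32-character lower-case hex digest
        System.out.println(EncryptUtils.md5Encrypt("hello fire"));

        // SHA digest rendered as a base-32 BigInteger string; a blank key defaults to "SHA"
        System.out.println(EncryptUtils.shaEncrypt("hello fire", null));
    }
}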
*/ package com.zto.fire.common.util; import java.io.File; import java.io.InputStream; import java.util.List; import java.util.Objects; /** * 文件操作工具类 * * @author ChengLong 2018年8月22日 13:10:03 */ public class FileUtils { private FileUtils() {} /** * 递归查找指定目录下的文件 * * @param path 路径 * @param fileName 文件名 * @return 文件全路径 */ public static File findFile(String path, String fileName, List fileList) { File searchFile = null; File dir = new File(path); if (dir.exists() && dir.isDirectory()) { for (File file : Objects.requireNonNull(dir.listFiles())) { if (file.isDirectory()) { searchFile = findFile(file.getPath(), fileName, fileList); } else { if (file.getName().equals(fileName)) { searchFile = file; break; } } } } if (searchFile != null) { fileList.add(searchFile); } return searchFile; } /** * 判断resource路径下的文件是否存在 * * @param fileName 配置文件名称 * @return null: 不存在,否则为存在 */ public static InputStream resourceFileExists(String fileName) { return FileUtils.class.getClassLoader().getResourceAsStream(fileName); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/util/FindClassUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.io.Serializable; import java.net.JarURLConnection; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.Enumeration; import java.util.LinkedList; import java.util.List; import java.util.jar.JarEntry; import java.util.jar.JarFile; /** * 查找指定包下所有的类 * Created by ChengLong on 2018-03-23. */ public class FindClassUtils { // 接口类class 用于过滤 private static Class superStrategy = Serializable.class; // 默认使用的类加载器 private static ClassLoader classLoader = FindClassUtils.class.getClassLoader(); private static final Logger logger = LoggerFactory.getLogger(FindClassUtils.class); private static final String CLASS_FILE = ".class"; private FindClassUtils() { } /** * 获取包下所有实现了superStrategy的类并加入list */ public static List> listPackageClasses(String... 
packageNames) { List> classList = new ArrayList<>(); if (packageNames != null && packageNames.length > 0) { for (String packageName : packageNames) { if (StringUtils.isNotBlank(packageName) && packageName.contains(".")) { URL url = FindClassUtils.classLoader.getResource(packageName.replace('.', '/')); String protocol = url.getProtocol(); if ("file".equals(protocol)) { // 本地自己可见的代码 FindClassUtils.findClassLocal(packageName, classList); } else if ("jar".equals(protocol)) { // 引用jar包的代码 FindClassUtils.findClassJar(packageName, classList); } } } } return classList; } /** * 本地查找 * * @param packName 包名 */ private static void findClassLocal(final String packName, final List> list) { URI url = null; try { url = FindClassUtils.classLoader.getResource(packName.replace('.', '/')).toURI(); File file = new File(url); file.listFiles(chiFile -> { if (chiFile.isDirectory()) { FindClassUtils.findClassLocal(packName + "." + chiFile.getName(), list); } if (chiFile.getName().endsWith(CLASS_FILE)) { Class clazz = null; try { clazz = FindClassUtils.classLoader.loadClass(packName + "." + chiFile.getName().replace(CLASS_FILE, "")); } catch (ClassNotFoundException e) { logger.error("未找到类异常", e); } if (FindClassUtils.superStrategy.isAssignableFrom(clazz)) { list.add((Class) clazz); } return true; } return false; }); } catch (URISyntaxException e1) { logger.error("未找到相关资源", e1); } } /** * 从jar包中查找指定包下的文件 * * @param packName 包名 */ private static void findClassJar(final String packName, final List> list) { String pathName = packName.replace('.', '/'); JarFile jarFile = null; try { URL url = FindClassUtils.classLoader.getResource(pathName); JarURLConnection jarURLConnection = (JarURLConnection) url.openConnection(); jarFile = jarURLConnection.getJarFile(); Enumeration jarEntries = jarFile.entries(); while (jarEntries.hasMoreElements()) { JarEntry jarEntry = jarEntries.nextElement(); String jarEntryName = jarEntry.getName(); if (jarEntryName.contains(pathName) && !jarEntryName.equals(pathName + "/")) { // 递归遍历子目录 if (jarEntry.isDirectory()) { String clazzName = jarEntry.getName().replace('/', '.'); int endIndex = clazzName.lastIndexOf('.'); String prefix = null; if (endIndex > 0) { prefix = clazzName.substring(0, endIndex); } findClassJar(prefix, list); } if (jarEntry.getName().endsWith(CLASS_FILE)) { Class clazz = FindClassUtils.classLoader.loadClass(jarEntry.getName().replace('/', '.').replace(CLASS_FILE, "")); if (FindClassUtils.superStrategy.isAssignableFrom(clazz)) { list.add((Class) clazz); } } } } } catch (Exception e) { logger.error("未在jar包中找到相关文件", e); } finally { try { if (jarFile != null) { jarFile.close(); } } catch (Exception e) { logger.error("关闭jarFile对象失败"); } } } /** * 用于判断当前以jar方式运行还是以idea方式运行 * * @return true:jar方式 false:idea运行 */ public static boolean isJar() { URL url = FindClassUtils.class.getProtectionDomain().getCodeSource().getLocation(); return url.getPath().endsWith(".jar"); } /** * 获取指定文件名在jar包中的位置,兼容非jar包 * * @param fileName 文件名 * @return 路径名+文件名 */ public static String findFileInJar(String fileName) { if (StringUtils.isBlank(fileName)) { return null; } String fullName = ""; URL url = FindClassUtils.class.getProtectionDomain().getCodeSource().getLocation(); if (url.getPath().endsWith(".jar")) { try (JarFile jarFile = new JarFile(url.getFile())) { Enumeration entrys = jarFile.entries(); while (entrys.hasMoreElements()) { JarEntry jar = entrys.nextElement(); String name = jar.getName(); if (name.endsWith("/" + fileName)) { fullName = name; break; } } } catch (IOException e) { 
logger.error("从jar包中查找文件过程中报错", e); } } else { // 在IDEA中执行 try { List searchList = new LinkedList<>(); FileUtils.findFile(FindClassUtils.class.getResource("/").getPath(), fileName, searchList); if (!searchList.isEmpty()) { fullName = searchList.get(0).getPath(); } } catch (Exception ex) { logger.error("从project中查找文件过程中报错", ex); } } return fullName; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/util/HttpClientUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util; import org.apache.commons.httpclient.*; import org.apache.commons.httpclient.methods.*; import org.apache.commons.httpclient.params.HttpMethodParams; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; /** * HTTP接口调用,各模块继承自该类 * Created by ChengLong on 2017-12-12. */ public class HttpClientUtils { private static final String CHARSET = "UTF-8"; private static final String HEADER_JSON_VALUE = "application/json"; private static final Logger logger = LoggerFactory.getLogger(HttpClientUtils.class); private HttpClientUtils() { } /** * 添加header请求信息 * * @param method 请求的方式 * @param headers 请求头信息 */ private static void setHeaders(HttpMethodBase method, Header... headers) { if (method != null && headers != null && headers.length > 0) { for (Header header : headers) { if (header != null) { method.setRequestHeader(header); } } } } /** * 以流的方式获取返回的消息体 */ private static String responseBody(HttpMethodBase method) throws IOException { if (method == null) { return ""; } StringBuilder stringBuffer = new StringBuilder(); BufferedReader reader = new BufferedReader(new InputStreamReader(method.getResponseBodyAsStream())); String str = ""; while ((str = reader.readLine()) != null) { stringBuffer.append(str); } return stringBuffer.toString(); } /** * HTTP通用接口调用(Get请求) * * @param url 地址 * @return 调用结果 */ public static String doGet(String url, Header... 
headers) throws IOException { String responseBody = ""; GetMethod getMethod = new GetMethod(); HttpClient httpClient = new HttpClient(); // 设置 get 请求超时为 5 秒 getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 3000); // 设置请求重试处理,用的是默认的重试处理:请求三次 getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler()); // 设置请求头 setHeaders(getMethod, headers); getMethod.setURI(new URI(url, true, CHARSET)); int statusCode = httpClient.executeMethod(getMethod); // 判断访问的状态码 if (statusCode != HttpStatus.SC_OK) { logger.error("请求出错: {}", getMethod.getStatusLine()); } // 读取 HTTP 响应内容,这里简单打印网页内容 responseBody = responseBody(getMethod); getMethod.releaseConnection(); httpClient.getHttpConnectionManager().closeIdleConnections(0); return responseBody; } /** * HTTP通用接口调用(Post请求) * * @param url 地址 * @return 调用结果 */ public static String doPost(String url, String json, Header... headers) throws IOException { String responses = ""; PostMethod postMethod = new PostMethod(); HttpClient httpClient = new HttpClient(); postMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 3000); postMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler()); // 设置请求头 setHeaders(postMethod, headers); postMethod.setURI(new URI(url, true, CHARSET)); postMethod.addRequestHeader("Content-Type", HEADER_JSON_VALUE); if (json != null && StringUtils.isNotBlank(json.trim())) { RequestEntity requestEntity = new StringRequestEntity(json, HEADER_JSON_VALUE, CHARSET); postMethod.setRequestHeader("Content-Length", String.valueOf(requestEntity.getContentLength())); postMethod.setRequestEntity(requestEntity); } httpClient.executeMethod(postMethod); responses = responseBody(postMethod); postMethod.releaseConnection(); httpClient.getHttpConnectionManager().closeIdleConnections(0); return responses; } /** * 发送一次post请求到指定的地址,不向上抛出异常 * * @param url 接口地址 * @return 调用结果 */ public static String doPut(String url, String json, Header... headers) throws IOException { String responseBody = ""; PutMethod putMethod = new PutMethod(); HttpClient htpClient = new HttpClient(); putMethod.setURI(new URI(url, true, CHARSET)); putMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 3000); putMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler()); // 设置请求头 setHeaders(putMethod, headers); if (json != null && StringUtils.isNotBlank(json.trim())) { RequestEntity requestEntity = new StringRequestEntity(json, HEADER_JSON_VALUE, CHARSET); putMethod.setRequestHeader("Content-Length", String.valueOf(requestEntity.getContentLength())); putMethod.setRequestEntity(requestEntity); } int statusCode = htpClient.executeMethod(putMethod); if (statusCode != HttpStatus.SC_OK) { return ""; } responseBody = responseBody(putMethod); putMethod.releaseConnection(); htpClient.getHttpConnectionManager().closeIdleConnections(0); return responseBody; } /** * 发送一次get请求到指定的地址,不向上抛出异常 * * @param url 接口地址 * @return 调用结果 */ public static String doGetIgnore(String url, Header... headers) { String response = ""; try { response = doGet(url, headers); } catch (Exception e) { logger.error("HTTP通用接口调用(Get)失败", e); } return response; } /** * 发送一次post请求到指定的地址,不向上抛出异常 * * @param url 接口地址 * @return 调用结果 */ public static String doPostIgnore(String url, String json, Header... 
headers) { String response = ""; try { response = doPost(url, json, headers); } catch (Exception e) { logger.error("HTTP通用接口调用(Post)失败", e); } return response; } /** * 发送一次put请求到指定的地址,不向上抛出异常 * * @param url 接口地址 * @return 调用结果 */ public static String doPutIgnore(String url, String json, Header... headers) { String response = ""; try { response = doPut(url, json, headers); } catch (Exception e) { logger.error("HTTP通用接口调用(Put)失败", e); } return response; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/util/IOUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.Closeable; /** * io流工具类 * * @author ChengLong 2019-3-27 11:17:56 */ public class IOUtils { private static final Logger logger = LoggerFactory.getLogger(IOUtils.class); private IOUtils() {} /** * 关闭多个流 */ public static void close(Closeable... closeables) { if (closeables != null && closeables.length > 0) { for (Closeable io : closeables) { try { if (io != null) { io.close(); } } catch (Exception e) { logger.error("close 对象失败", e); } } } } /** * 关闭多个process对象 */ public static void close(Process... process) { if (process != null && process.length > 0) { for (Process pro : process) { try { if (pro != null) { pro.destroy(); } } catch (Exception e) { logger.error("close process 对象失败", e); } } } } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/util/MathUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
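A sketch of calling HttpClientUtils through its exception-swallowing *Ignore variants, assuming commons-httpclient 3.x is on the classpath; the URLs are placeholders, not real endpoints.

import com.zto.fire.common.util.HttpClientUtils;
import org.apache.commons.httpclient.Header;

public class HttpClientUtilsExample {
    public static void main(String[] args) {
        // doGetIgnore swallows exceptions and returns an empty string on failure
        String body = HttpClientUtils.doGetIgnore("http://localhost:8080/health",
                new Header("Accept", "application/json"));
        System.out.println("GET -> " + body);

        // doPostIgnore sends a JSON payload; Content-Type is set to application/json by the utility
        String resp = HttpClientUtils.doPostIgnore("http://localhost:8080/api/echo",
                "{\"msg\":\"hello\"}");
        System.out.println("POST -> " + resp);
    }
}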
*/ package com.zto.fire.common.util; import java.math.*; /** * 数据计算工具类 * * @author ChengLong 2019年9月29日 13:50:31 */ public class MathUtils { private MathUtils() {} /** * 计算百分比,并保留指定的小数位 * * @param molecule 分子 * @param denominator 分母 * @param scale 精度 * @return 百分比 */ public static double percent(long molecule, long denominator, int scale) { if (molecule == 0 || denominator == 0) { return 0.00; } return BigDecimal.valueOf(100.00 * molecule / denominator).setScale(scale, RoundingMode.HALF_UP).doubleValue(); } /** * 将指定double类型数据以四舍五入的方式保留指定的精度 * * @param data 数据 * @param scale 精度 * @return 四舍五入后的数据 */ public static double doubleScale(double data, int scale) { return BigDecimal.valueOf(data).setScale(scale, RoundingMode.HALF_UP).doubleValue(); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/util/OSUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.lang.management.ManagementFactory; import java.net.InetAddress; import java.net.NetworkInterface; import java.net.ServerSocket; import java.util.Enumeration; import java.util.Random; /** * 用于获取服务器负载信息,包括磁盘io、cpu负载、内存使用、网络使用等等 * 注:使用此工具需预先安装:sudo yum install sysstat * * @author ChengLong 2019-04-08 13:57:31 */ public class OSUtils { private static String ip; private static String hostname; private static String pid; private static Random random = new Random(); private static final String OSNAME = "os.name"; private static final Logger logger = LoggerFactory.getLogger(OSUtils.class); private OSUtils() { } /** * 获取主机地址信息 */ public static InetAddress getHostLANAddress() { try { InetAddress candidateAddress = null; // 遍历所有的网络接口 for (Enumeration ifaces = NetworkInterface.getNetworkInterfaces(); ifaces.hasMoreElements(); ) { NetworkInterface iface = ifaces.nextElement(); // 在所有的接口下再遍历IP for (Enumeration inetAddrs = iface.getInetAddresses(); inetAddrs.hasMoreElements(); ) { InetAddress inetAddr = inetAddrs.nextElement(); if (!inetAddr.isLoopbackAddress()) { // 排除loopback类型地址 if (inetAddr.isSiteLocalAddress()) { // 如果是site-local地址 return inetAddr; } else if (candidateAddress == null) { // site-local类型的地址未被发现,先记录候选地址 candidateAddress = inetAddr; } } } } if (candidateAddress != null) { return candidateAddress; } // 如果没有发现 non-loopback地址.只能用最次选的方案 return InetAddress.getLocalHost(); } catch (Exception e) { logger.error("获取主机地址信息失败", e); } return null; } /** * 获取本机的ip地址 * * @return ip地址 */ public static String getIp() { if (StringUtils.isBlank(ip)) { InetAddress inetAddress = getHostLANAddress(); if (inetAddress != null) { ip = inetAddress.getHostAddress(); } } return ip; } /** * 
获取本机的hostname * * @return hostname */ public static String getHostName() { if (StringUtils.isBlank(hostname)) { InetAddress inetAddress = getHostLANAddress(); if (inetAddress != null) { hostname = inetAddress.getHostName(); } } return hostname; } /** * 随机获取系统未被使用的端口号 */ public static int getRundomPort() { int port = 0; try (ServerSocket socket = new ServerSocket(0)) { port = socket.getLocalPort(); logger.debug("成功获取随机端口号:{}", port); } catch (Exception e) { logger.error("端口号{}已被占用,尝试扫描新的未被占用的端口号."); } return port; } /** * 获取当前进程的pid * * @return pid */ public static String getPid() { if (StringUtils.isBlank(pid)) { pid = ManagementFactory.getRuntimeMXBean().getName().split("@")[0]; } return pid; } /** * 判断当前运行环境是否为linux */ public static boolean isLinux() { return System.getProperty(OSNAME).toLowerCase().contains("linux"); } /** * 判断当前运行环境是否为windows */ public static boolean isWindows() { String os = System.getProperty(OSNAME); return os.toLowerCase().startsWith("windows"); } /** * 判断当前是否运行在本地环境下 * 本地环境包括:Windows、Mac OS */ public static boolean isLocal() { return isWindows() || isMac(); } /** * 是否为mac os环境 */ public static boolean isMac() { String os = System.getProperty(OSNAME); return os.toLowerCase().contains("mac"); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/util/ProcessUtil.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.BufferedReader; import java.io.InputStreamReader; import java.util.Objects; /** * 执行命令的工具 * * @author ChengLong 2019-4-10 15:50:23 */ public class ProcessUtil { private static final Logger logger = LoggerFactory.getLogger(ProcessUtil.class); private ProcessUtil() {} /** * 执行多条linux命令,不返回命令执行日志 * * @param commands linux命令 * @return 命令执行结果的一行数据 */ public static void executeCmds(String... 
commands) { Objects.requireNonNull(commands, "命令不能为空"); for (String command : commands) { executeCmdForLine(command); } } /** * 执行一条linux命令,仅返回命令的一行 * * @param cmd linux命令 * @return 命令执行结果的一行数据 */ public static String executeCmdForLine(String cmd) { if (!OSUtils.isLinux() || StringUtils.isBlank(cmd)) { // 如果是windows环境 return " "; } Process process = null; BufferedReader reader = null; String result = ""; try { process = Runtime.getRuntime().exec(cmd); reader = new BufferedReader(new InputStreamReader(process.getInputStream())); String line = ""; while ((line = reader.readLine()) != null) { if (StringUtils.isNotBlank(line)) { result = line; } } } catch (Exception e) { logger.error("执行命令报错", e); } finally { IOUtils.close(process); IOUtils.close(reader); } return result; } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/util/ReflectionUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util; import com.zto.fire.common.conf.FirePS1Conf; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.lang.annotation.Annotation; import java.lang.annotation.ElementType; import java.lang.reflect.Field; import java.lang.reflect.Method; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import static com.zto.fire.common.util.UnitFormatUtils.readable; /** * 反射工具类,获取各元素信息后缓存到map中 * Created by ChengLong on 2017-03-30. */ public class ReflectionUtils { private static final Map, Map> cacheFieldMap = new ConcurrentHashMap<>(); private static final Map, Map> cacheMethodMap = new ConcurrentHashMap<>(); private static final Logger logger = LoggerFactory.getLogger(ReflectionUtils.class); private ReflectionUtils() { } public static void setAccessible(Field field) { if (field != null) { field.setAccessible(true); } } public static void setAccessible(Method method) { if (method != null) { method.setAccessible(true); } } /** * 根据类名反射获取Class对象 */ public static Class forName(String className) { try { return Class.forName(className); } catch (Exception e) { logger.error("未找到类信息:" + className, e); return null; } } /** * 用于判断某类是否存在指定的字段 */ public static boolean containsField(Class clazz, String fieldName) { Field field = getFieldByName(clazz, fieldName); return field != null ? 
true : false; } /** * 获取所有公有字段,并返回Map */ private static Map getFields(Class clazz) { if (clazz == null) { return Collections.emptyMap(); } Field[] fields = clazz.getFields(); Map fieldMap = new HashMap<>(fields.length); for (Field field : fields) { fieldMap.put(field.getName(), field); } return fieldMap; } /** * 获取所有声明字段,并返回Map */ private static Map getDeclaredFields(Class clazz) { if (clazz == null) { return Collections.emptyMap(); } Field[] fields = clazz.getDeclaredFields(); Map fieldMap = new HashMap<>(fields.length); for (Field field : fields) { setAccessible(field); fieldMap.put(field.getName(), field); } return fieldMap; } /** * 获取所有字段,含私有和继承而来的,并返回Map */ public static Map getAllFields(Class clazz) { if (!cacheFieldMap.containsKey(clazz)) { Map fieldMap = new HashMap<>(); fieldMap.putAll(getFields(clazz)); fieldMap.putAll(getDeclaredFields(clazz)); cacheFieldMap.put(clazz, fieldMap); } return cacheFieldMap.get(clazz); } /** * 根据成员变量名称获取Filed类型(从缓存中获取) */ public static Field getFieldByName(Class clazz, String fieldName) { return getAllFields(clazz).get(fieldName); } /** * 获取所有方法,含私有和继承而来的,并返回Map */ public static Map getAllMethods(Class clazz) { if (!cacheMethodMap.containsKey(clazz)) { Map methodMap = new HashMap<>(); methodMap.putAll(getMethods(clazz)); methodMap.putAll(getDeclaredMethods(clazz)); cacheMethodMap.put(clazz, methodMap); } return cacheMethodMap.get(clazz); } /** * 根据方法名称获取Method类型(从缓存中获取) * * @param clazz 类类型 * @param methodName 方法名称 * @return Method */ public static Method getMethodByName(Class clazz, String methodName) { return getAllMethods(clazz).get(methodName); } /** * 根据方法名称获取Method类型(从缓存中获取) * * @param className 类名 * @param methodName 方法名称 * @return Method */ public static Method getMethodByName(String className, String methodName) { return getAllMethods(forName(className)).get(methodName); } /** * 用于判断某类是否存在指定的方法名 */ public static boolean containsMethod(Class clazz, String methodName) { Method method = getMethodByName(clazz, methodName); return method != null ? 
true : false; } /** * 获取所有公有方法,并返回Map */ private static Map getMethods(Class clazz) { if (clazz == null) { return Collections.emptyMap(); } Method[] methods = clazz.getMethods(); Map methodMap = new HashMap<>(methods.length); for (Method method : methods) { methodMap.put(method.getName(), method); } return methodMap; } /** * 获取所有声明方法,并返回Map */ private static Map getDeclaredMethods(Class clazz) { if (clazz == null) { return Collections.emptyMap(); } Method[] methods = clazz.getDeclaredMethods(); Map methodMap = new HashMap<>(methods.length); for (Method method : methods) { setAccessible(method); methodMap.put(method.getName(), method); } return methodMap; } /** * 获取指定field的类型 */ public static Class getFieldType(Class clazz, String fieldName) { if (clazz == null || StringUtils.isBlank(fieldName)) { return null; } try { Map fieldMap = getAllFields(clazz); if (fieldMap == null) { return null; } Field field = fieldMap.get(fieldName); if (field != null) { return field.getType(); } } catch (Exception e) { logger.error("指定的Field:" + fieldName + "不存在,请检查", e); } return null; } /** * 获取指定的annotation * * @param scope annotation所在的位置 * @param memberName 成员名称,指定获取指定成员的Annotation实例 */ private static Annotation getAnnotation(Class clazz, ElementType scope, String memberName, Class annoClass) { try { if (ElementType.FIELD == scope) { Field field = clazz.getDeclaredField(memberName); setAccessible(field); return field.getAnnotation(annoClass); } else if (ElementType.METHOD == scope) { Method method = clazz.getDeclaredMethod(memberName); setAccessible(method); return method.getAnnotation(annoClass); } else if (ElementType.TYPE == scope) { return clazz.getAnnotation(annoClass); } } catch (Exception e) { logger.error("获取annotation出现异常", e); } return null; } /** * 获取指定的annotation * * @param scope annotation所在的位置 * @param memberName 成员名称,指定获取指定成员的Annotation实例 */ private static List getAnnotations(Class clazz, ElementType scope, String memberName) { try { if (ElementType.FIELD == scope) { Field field = clazz.getDeclaredField(memberName); setAccessible(field); return Arrays.asList(field.getDeclaredAnnotations()); } else if (ElementType.METHOD == scope) { Method method = clazz.getDeclaredMethod(memberName); setAccessible(method); return Arrays.asList(method.getDeclaredAnnotations()); } else if (ElementType.TYPE == scope) { return Arrays.asList(clazz.getDeclaredAnnotations()); } } catch (Exception e) { logger.error("获取annotation出现异常", e); } return Collections.emptyList(); } /** * 获取Field指定的annotation */ public static Annotation getFieldAnnotation(Class clazz, String fieldName, Class annoClass) { return getAnnotation(clazz, ElementType.FIELD, fieldName, annoClass); } /** * 获取Field所有annotation */ public static List getFieldAnnotations(Class clazz, String fieldName) { return getAnnotations(clazz, ElementType.FIELD, fieldName); } /** * 获取Method指定的annotation */ public static Annotation getMethodAnnotation(Class clazz, String methodName, Class annoClass) { return getAnnotation(clazz, ElementType.METHOD, methodName, annoClass); } /** * 获取Method所有annotation */ public static List getMethodAnnotations(Class clazz, String methodName) { return getAnnotations(clazz, ElementType.METHOD, methodName); } /** * 获取类指定annotation */ public static Annotation getClassAnnotation(Class clazz, Class annoClass) { return getAnnotation(clazz, ElementType.TYPE, clazz.getName(), annoClass); } /** * 获取类所有annotation */ public static List getClassAnnotations(Class clazz) { return getAnnotations(clazz, ElementType.TYPE, clazz.getName()); } /** * 
根据注解调用对应的方法 * @param target * 目标对象 * @param annotationClass * 注解类型 * @param args * 方法反射调用传参 */ public static void invokeAnnoMethod(Object target, Class annotationClass, Object ...args) throws Exception { if (target == null || annotationClass == null) { return; } try { for (Method method : getAllMethods(target.getClass()).values()) { if (method.isAnnotationPresent(annotationClass)) { method.invoke(target, args); } } } catch (Exception e) { logger.error("反射调用方法失败,请检查:" + target.getClass().getName()); throw e; } } /** * 根据注解调用步骤方法 * @param target * 目标对象 * @param annotations * 注解类型列表 */ public static void invokeStepAnnoMethod(Object target, Class ... annotations) throws Exception { if (target == null || annotations == null || annotations.length == 0) { return; } long successCount = 0, failedCount = 0, begin = System.currentTimeMillis(); try { Collection methods = getAllMethods(target.getClass()).values(); for (Class annotationClass : annotations) { for (Method method : methods) { // 避免因为将注解标注到process方法上导致process执行多次 if (!"process".equals(method.getName()) && method.isAnnotationPresent(annotationClass)) { Annotation anno = method.getAnnotation(annotationClass); Object retVal = getAnnoFieldValue(anno, "value"); String desc = retVal == null ? "" : retVal.toString(); if (StringUtils.isBlank(desc)) { desc = "开始执行"; } String step = annotationClass.getSimpleName(); logger.warn(FirePS1Conf.GREEN() + " " + step + ". " + desc + " " + FirePS1Conf.DEFAULT()); long start = System.currentTimeMillis(); Object skipError = getAnnoFieldValue(anno, "skipError"); try { method.invoke(target, null); successCount += 1; } catch (Exception e) { long end = System.currentTimeMillis(); logger.error(FirePS1Conf.RED() + " " + step + ". 执行报错!耗时:"+ (readable(end - start, UnitFormatUtils.TimeUnitEnum.MS)) + " " + FirePS1Conf.DEFAULT() + "\n", e); boolean isSkip = Boolean.parseBoolean(skipError.toString()); failedCount += 1; if (!isSkip) { throw e; } } long end = System.currentTimeMillis(); logger.warn(FirePS1Conf.GREEN() + " " + step + ". 执行耗时:" + (readable(end - start, UnitFormatUtils.TimeUnitEnum.MS)) + " " + FirePS1Conf.DEFAULT() + "\n"); } } } long finalEnd = System.currentTimeMillis(); long allCount = successCount + failedCount; if (allCount > 0) { logger.warn(FirePS1Conf.GREEN() + " Finished. 总计:" + allCount + "个 成功:" + successCount + "个 失败:" + failedCount + "个, 执行耗时:" + (readable(finalEnd - begin, UnitFormatUtils.TimeUnitEnum.MS)) + " " + FirePS1Conf.DEFAULT() + "\n"); } } catch (Exception e) { logger.error("反射调用方法失败,请检查:" + target.getClass().getName()); throw e; } } /** * 获取指定Annotation的字段配置值 * @param anno * 具体的注解类 * @param methodName * 注解的field */ public static Object getAnnoFieldValue(Annotation anno, String methodName) throws Exception { Method[] methods = anno.getClass().getMethods(); Object retVal = null; for (Method method : methods) { if (method.getName().equalsIgnoreCase(methodName)) { retVal = method.invoke(anno, null); } } return retVal; } /** * 获取指定类所在的jar包 */ public static String getClassInJar(Class clazz) { return clazz.getProtectionDomain().getCodeSource().getLocation().getFile(); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/util/StringsUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
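A small sketch of the cached field and method lookups in ReflectionUtils, using a throwaway Demo class defined inside the example itself:

import com.zto.fire.common.util.ReflectionUtils;
import java.lang.reflect.Field;
import java.lang.reflect.Method;

public class ReflectionUtilsExample {
    static class Demo {
        private String name = "fire";
        public String greet() { return "hello " + this.name; }
    }

    public static void main(String[] args) throws Exception {
        // lookups are cached per class inside ReflectionUtils
        Field nameField = ReflectionUtils.getFieldByName(Demo.class, "name");
        Method greet = ReflectionUtils.getMethodByName(Demo.class, "greet");

        Demo demo = new Demo();
        nameField.set(demo, "zto");             // declared fields are made accessible during the first scan
        System.out.println(greet.invoke(demo)); // hello zto

        System.out.println(ReflectionUtils.containsField(Demo.class, "name"));     // true
        System.out.println(ReflectionUtils.containsMethod(Demo.class, "missing")); // false
        System.out.println(ReflectionUtils.getFieldType(Demo.class, "name"));      // class java.lang.String
    }
}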
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util; import org.apache.commons.lang3.StringUtils; import java.util.Map; import java.util.Objects; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * 字符串工具类 * * @author ChengLong 2019-4-11 09:06:26 */ public class StringsUtils { private StringsUtils() { } /** * 处理成超链接 * * @param str * @return */ public static String hrefTag(String str) { return append("", str, ""); } /** * 追加换行 * * @param str * @return */ public static String brTag(String str) { return append(str, "
"); } /** * 字符串拼接 * * @param strs 多个字符串 * @return 拼接结果 */ public static String append(String... strs) { StringBuilder sb = new StringBuilder(); if (null != strs && strs.length > 0) { for (String str : strs) { sb.append(str); } } return sb.toString(); } /** * replace多组字符串中的数据 * * @param map * @return * @apiNote replace(str, ImmutableMap.of ( " # ", " ", ", ", " ")) */ public static String replace(String str, Map map) { if (StringUtils.isNotBlank(str) && null != map && map.size() > 0) { for (Map.Entry entry : map.entrySet()) { str = str.replace(entry.getKey(), entry.getValue()); } } return str; } /** * 16进制的字符串表示转成字节数组 * * @param hexString 16进制格式的字符串 * @return 转换后的字节数组 **/ public static byte[] toByteArray(String hexString) { if (StringUtils.isEmpty(hexString)) throw new IllegalArgumentException("this hexString must not be empty"); hexString = hexString.toLowerCase(); final byte[] byteArray = new byte[hexString.length() / 2]; int k = 0; for (int i = 0; i < byteArray.length; i++) {//因为是16进制,最多只会占用4位,转换成字节需要两个16进制的字符,高位在先 byte high = (byte) (Character.digit(hexString.charAt(k), 16) & 0xff); byte low = (byte) (Character.digit(hexString.charAt(k + 1), 16) & 0xff); byteArray[i] = (byte) (high << 4 | low); k += 2; } return byteArray; } /** * 字节数组转成16进制表示格式的字符串 * * @param byteArray 需要转换的字节数组 * @return 16进制表示格式的字符串 **/ public static String toHexString(byte[] byteArray) { if (byteArray == null || byteArray.length < 1) throw new IllegalArgumentException("this byteArray must not be null or empty"); final StringBuilder hexString = new StringBuilder(); for (int i = 0; i < byteArray.length; i++) { if ((byteArray[i] & 0xff) < 0x10)//0~F前面不零 hexString.append('0'); hexString.append(Integer.toHexString(0xFF & byteArray[i])); } return hexString.toString().toLowerCase(); } /** * 具有容错功能的substring,如果下标越界,则默认取到尾部 * * @param str 原字符串 * @param start 索引起始 * @param end 索引结束 * @return 截取后的子字符串 */ public static String substring(String str, int start, int end) { if (StringUtils.isBlank(str) || Math.abs(start) > Math.abs(end)) { return ""; } int length = str.length(); if (length >= Math.abs(end)) { return str.substring(Math.abs(start), Math.abs(end)); } else { return str.substring(Math.abs(start), Math.abs(length)); } } /** * 判断一个字符串是否为整型 * 1. 包号空字符串的不能看作是整数 * 2. 
超过Int最大值的不能作为整数 */ public static boolean isInt(String str) { if (StringUtils.isBlank(str)) return false; try { Integer.parseInt(str); return true; } catch (Exception e) { // 如果超过精度,则不能看做是整型 return false; } } /** * 判断字符串是否为整数(前面是数值类型,最后是L或l结尾,也认为是长整数) */ public static boolean isLong(String str) { if (StringUtils.isBlank(str)) return false; str = str.toUpperCase(); if (str.endsWith("L")) { try { Long.parseLong(str.replace("L", "")); return true; } catch (Exception e) { return false; } } return false; } /** * 用于判断字符串是否为布尔类型 */ public static boolean isBoolean(String str) { if (StringUtils.isBlank(str)) return false; return "true".equalsIgnoreCase(str) || "false".equalsIgnoreCase(str); } /** * 用于判断字符串是否为float类型 * 以字母F或f结尾的合法数值型字符串认为是float类型 */ public static boolean isFloat(String str) { if (StringUtils.isBlank(str)) return false; str = str.toUpperCase(); if (str.endsWith("F")) { try { Float.parseFloat(str.replace("F", "")); return true; } catch (Exception e) { return false; } } return false; } /** * 用于判断字符串是否为float类型 * 以字母F或f结尾的合法数值型字符串认为是float类型 */ public static boolean isDouble(String str) { if (StringUtils.isBlank(str)) return false; str = str.toUpperCase(); if (str.endsWith("D")) { try { Double.parseDouble(str.replace("D", "")); return true; } catch (Exception e) { return false; } } return false; } /** * 根据字符串具体的类型进行转换,返回转换类型之后的数据 */ public static Object parseString(String str) { if (StringsUtils.isLong(str)) { String longStr = str.toUpperCase().replace("L", ""); return Long.valueOf(longStr); } else if (StringsUtils.isInt(str)) { return Integer.valueOf(str); } else if (StringsUtils.isBoolean(str)) { return Boolean.valueOf(str); } else if (StringsUtils.isFloat(str)) { String floatStr = str.toUpperCase().replace("F", ""); return Float.valueOf(floatStr); } else if (StringsUtils.isDouble(str)) { String doubleStr = str.toUpperCase().replace("D", ""); return Double.valueOf(doubleStr); } else { return str; } } /** * 用于判断给定的字符串是否为数值类型,负数、小数均认为是数值类型 * @param str * 字符串 * @return * true:数值类型 false:非数值类型 */ public static boolean isNumeric(String str) { Pattern pattern = Pattern.compile("(^\\-?[1-9]\\d*\\.?\\d*$)|(^\\-?0\\.\\d*[1-9]$)"); Matcher matcher = pattern.matcher(str); return matcher.matches(); } /** * 基于时间戳的随机算法从字符串列表中获取随机的字符串 * * @param strs * 被随机分隔的一组字符串 * @param delimiter * 分隔符 * @return * 随机的字符串 */ public static String randomSplit(String strs, String delimiter) { if (StringUtils.isBlank(strs)) throw new IllegalArgumentException("Hive Thrift Server url不能为空!"); if (StringUtils.isBlank(delimiter)) delimiter = ","; String[] metastores = strs.split(delimiter); if (metastores.length == 0) throw new IllegalArgumentException("未能根据指定的分隔符[" + delimiter + "]分隔字符串:" + strs); return StringUtils.trim(metastores[(int) (System.currentTimeMillis() % metastores.length)]); } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/util/UnitFormatUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util; import java.math.BigDecimal; import java.math.RoundingMode; import java.util.Arrays; import java.util.LinkedList; import java.util.List; /** * 通用的计量单位转换工具 * * @author ChengLong 2019年9月29日 18:05:56 */ public class UnitFormatUtils { /** * 磁盘数据单位体系中的单位的枚举 */ public enum DateUnitEnum { // 数据类型的内容 BYTE, KB, MB, GB, TB, PB, EB; // 对数据类型进行排序 private static List orderList = Arrays.asList(BYTE, KB, MB, GB, TB, PB, EB); // 定义计量单位换算关系 private static List metric = init(1024, 1024, 1024, 1024, 1024, 1024, 1); } /** * 时间单位体系中的单位的枚举 */ public enum TimeUnitEnum { // 数据类型的内容 US, MS, S, MIN, H, D; // 对数据类型进行排序 private static List orderList = Arrays.asList(US, MS, S, MIN, H, D); // 定义计量单位换算关系 private static List metric = init(1000, 1000, 60, 60, 24, 1); } /** * 获取当前单位在list中的索引值 * * @param unit 要查询的单位 * @return 索引值 */ private static int getIndex(List orderList, T unit) { for (int i = 0; i < orderList.size(); i++) { if (orderList.get(i) == unit) { return i; } } return 0; } /** * 初始化计量单位列表 */ private static List init(int ... metrics) { List list = new LinkedList<>(); for (int metric : metrics) { list.add(new BigDecimal(metric)); } return list; } /** * 将传入磁盘数据的大数/小数等等转换为易读的形式 * 易读的标准是可以展示为某一单位区间内的大于1的数,自动取两位小数 * * @param data 传入的初始数值 * @param unit 传入数值的单位 * @return 转换过后的易读字符串,带单位 */ public static String readable(Number data, DateUnitEnum unit) { BigDecimal data1 = new BigDecimal(data.toString()); // 获取初始参数的索引值 int index = getIndex(DateUnitEnum.orderList, unit); // 判定传入参数在当前单位下,是否超出其数值区间 if (data.longValue() < DateUnitEnum.metric.get(DateUnitEnum.orderList.indexOf(unit)).longValue() || unit == DateUnitEnum.orderList.get(DateUnitEnum.orderList.size() - 1)) { // 判定传入的数值是否小于1,如果小于1,则进入 if (data.longValue() < 1 && unit != DateUnitEnum.orderList.get(0)) { // 对小于1的参数进行放大,向上进一位:数值放大相应进制,进制下调一位 return readable(data1.multiply(DateUnitEnum.metric.get(index - 1)), DateUnitEnum.orderList.get(index - 1)); } // 如果是本单位区间的大于1的值,进行返回处理 return data1.divide(new BigDecimal(1), 2, RoundingMode.HALF_UP) + unit.toString(); } // 超出了当前单位的取值范围 else { // 对数值升位:数值除以相应的进制,单位上调一位 return readable(data1.divide(DateUnitEnum.metric.get(index), 2, RoundingMode.HALF_UP), DateUnitEnum.orderList.get(index + 1)); } } /** * 将磁盘数据大小从一种单位转换为传入的单位 * * @param data 输入的初始参数 * @param fromUnit 输入的初始参数的单位 * @param toUnit 要转换的目标单位 */ public static String format(Number data, DateUnitEnum fromUnit, DateUnitEnum toUnit) { BigDecimal data1 = new BigDecimal(data.toString()); // 获取初始参数的索引值 int index = getIndex(DateUnitEnum.orderList, fromUnit); // 判别初始参数索引是否高于目标参数索引 if (DateUnitEnum.orderList.indexOf(fromUnit) > DateUnitEnum.orderList.indexOf(toUnit)) { // 递归调用方法,对参数放大相应进制倍数,将单位下调一位 return format(data1.multiply(DateUnitEnum.metric.get(index - 1)), DateUnitEnum.orderList.get(index - 1), toUnit); // 判别初始参数索引是否低于目标参数索引 } else if (DateUnitEnum.orderList.indexOf(fromUnit) < DateUnitEnum.orderList.indexOf(toUnit)) { // 递归调用方法,对参数缩小相应进制倍数,将单位上调一位 return format(data1.divide(DateUnitEnum.metric.get(index), 2, RoundingMode.HALF_UP), DateUnitEnum.orderList.get(index + 1), toUnit); // 取得fromUnit与toUnit的索引值相同的情况 } else { // 进行数据处理,返回相应结果 return 
data1.divide(new BigDecimal(1), 2, RoundingMode.HALF_UP) + fromUnit.toString(); } } /** * 将传入时间的大数/小数等等转换为易读的形式 * 易读的标准是可以展示为某一单位区间内的大于1的数,自动取两位小数 * * @param data 传入的初始数值 * @param unit 传入数值的单位 * @return 转换过后的易读字符串,带单位 */ public static String readable(Number data, TimeUnitEnum unit) { BigDecimal data1 = new BigDecimal(data.toString()); // 获取初始参数的索引值 int index = getIndex(TimeUnitEnum.orderList, unit); // 判定传入参数在当前单位下,是否超出其数值区间 if (data.longValue() < TimeUnitEnum.metric.get(TimeUnitEnum.orderList.indexOf(unit)).longValue() || unit == TimeUnitEnum.orderList.get(TimeUnitEnum.orderList.size() - 1)) { // 判定传入的数值是否小于1,如果小于1,则进入 if (data.longValue() < 1 && unit != TimeUnitEnum.orderList.get(0)) { // 对小于1的参数进行放大,向上进一位:数值放大相应进制,进制下调一位 return readable(data1.multiply(TimeUnitEnum.metric.get(index - 1)), TimeUnitEnum.orderList.get(index - 1)); } // 如果是本单位区间的大于1的值,进行返回处理 return data1.divide(new BigDecimal(1), 2, RoundingMode.HALF_UP) + unit.toString().toLowerCase(); } // 超出了当前单位的取值范围 else { // 对数值升位:数值除以相应的进制,单位上调一位 return readable(data1.divide(TimeUnitEnum.metric.get(index), 2, RoundingMode.HALF_UP), TimeUnitEnum.orderList.get(index + 1)); } } /** * 将时间从一种单位转换为传入的单位 * * @param data 输入的初始参数 * @param fromUnit 输入的初始参数的单位 * @param toUnit 要转换的目标单位 */ public static String format(Number data, TimeUnitEnum fromUnit, TimeUnitEnum toUnit) { BigDecimal data1 = new BigDecimal(data.toString()); // 获取初始参数的索引值 int index = getIndex(TimeUnitEnum.orderList, fromUnit); // 判别初始参数索引是否高于目标参数索引 if (TimeUnitEnum.orderList.indexOf(fromUnit) > TimeUnitEnum.orderList.indexOf(toUnit)) { // 递归调用方法,对参数放大相应进制倍数,将单位下调一位 return format(data1.multiply(TimeUnitEnum.metric.get(index - 1)), TimeUnitEnum.orderList.get(index - 1), toUnit); // 判别初始参数索引是否低于目标参数索引 } else if (TimeUnitEnum.orderList.indexOf(fromUnit) < TimeUnitEnum.orderList.indexOf(toUnit)) { // 递归调用方法,对参数缩小相应进制倍数,将单位上调一位 return format(data1.divide(TimeUnitEnum.metric.get(index), 2, RoundingMode.HALF_UP), TimeUnitEnum.orderList.get(index + 1), toUnit); // 取得fromUnit与toUnit的索引值相同的情况 } else { // 进行数据处理,返回相应结果 return data1.divide(new BigDecimal(1), 2, RoundingMode.HALF_UP) + fromUnit.toString(); } } } ================================================ FILE: fire-common/src/main/java/com/zto/fire/common/util/YarnUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.util; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Yarn相关工具类 * @author ChengLong 2018年8月10日 16:03:29 */ public class YarnUtils { private YarnUtils() {} /** * 使用正则提取日志中的applicationId * @param log * @return */ public static String getAppId(String log) { // 正则表达式规则 String regEx = "application_[0-9]+_[0-9]+"; // 编译正则表达式 Pattern pattern = Pattern.compile(regEx); // 忽略大小写的写法 Matcher matcher = pattern.matcher(log); // 查找字符串中是否有匹配正则表达式的字符/字符串 if(matcher.find()) { return matcher.group(); } else { return ""; } } } ================================================ FILE: fire-common/src/main/resources/log4j.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # log4j.rootLogger = INFO, stdout, D ### \u8F93\u51FA\u5230\u63A7\u5236\u53F0 ### log4j.appender.stdout = org.apache.log4j.ConsoleAppender log4j.appender.stdout.Target = System.out log4j.appender.stdout.layout = org.apache.log4j.PatternLayout log4j.appender.stdout.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss.SSS} [%thread]-[%p]-[%c] %m%n ### \u8F93\u51FA\u5230\u65E5\u5FD7\u6587\u4EF6 ### log4j.appender.D = org.apache.log4j.DailyRollingFileAppender log4j.appender.D.File = ./logs/fire.log log4j.appender.D.Append = true log4j.appender.D.Threshold = INFO log4j.appender.D.layout = org.apache.log4j.PatternLayout log4j.appender.D.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss.SSS} [%thread]-[%p]-[%c]-[%l] %m%n ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/bean/TableIdentifier.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.bean import com.zto.fire.predef._ /** * 用于标识表的信息 * * @author ChengLong 2022-09-06 15:19:55 * @since 2.3.2 */ case class TableIdentifier(private val _table: String, private val _database: String = "") { lazy val table = { if (this._table.contains(".")) { this._table.split('.')(1) } else this._table } lazy val database = { if (isEmpty(this._database) && this._table.contains(".")) { this._table.split('.')(0) } else this._database } /** * 用于判断是否存在数据库名称 * 如果将库名直接写到表名中,也认为库存在 */ def existsDB: Boolean = noEmpty(this.database) || table.contains(".") def notExistsDB: Boolean = !this.existsDB /** * 获取库表描述信息 */ def identifier: String = this.toString override def toString: JString = { if (noEmpty(database)) s"$database.$table".trim else table.trim } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/conf/FireConf.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.conf import com.zto.fire.common.util.PropUtils /** * 常量配置类 * @author ChengLong * @since 1.1.0 * @create 2020-07-13 15:00 */ private[fire] class FireConf { // 用于区分不同的流计算引擎类型 private[fire] lazy val engine = PropUtils.engine // Fire框架相关配置 val frameworkConf = FireFrameworkConf // kafka相关配置 val kafkaConf = FireKafkaConf // rocketMQ相关配置 val rocketMQConf = FireRocketMQConf // 颜色预定义 val ps1Conf = FirePS1Conf // hive相关配置 val hiveConf = FireHiveConf } object FireConf extends FireConf ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/conf/FireFrameworkConf.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.conf import com.zto.fire.common.util.{DateFormatUtils, PropUtils} import org.apache.commons.lang3.StringUtils /** * Fire框架相关配置 * * @author ChengLong * @since 1.1.0 * @create 2020-07-13 14:54 */ private[fire] object FireFrameworkConf { // fire版本号 lazy val FIRE_VERSION = "fire.version" lazy val DRIVER_CLASS_NAME = "driver.class.name" // fire内置线程池大小 lazy val FIRE_THREAD_POOL_SIZE = "fire.thread.pool.size" // fire内置定时任务线程池大小 lazy val FIRE_THREAD_POOL_SCHEDULE_SIZE = "fire.thread.pool.schedule.size" // 是否启用fire框架restful服务 lazy val FIRE_REST_ENABLE = "fire.rest.enable" lazy val FIRE_REST_URL_HOSTNAME = "fire.rest.url.hostname" lazy val FIRE_CONF_DEPLOY_ENGINE = "fire.conf.deploy.engine" lazy val FIRE_ENGINE_CONF_HELPER = "com.zto.fire.core.sync.SyncEngineConfHelper" // rest接口权限认证 lazy val FIRE_REST_FILTER_ENABLE = "fire.rest.filter.enable" // 用于配置是否关闭fire内置的所有累加器 lazy val FIRE_ACC_ENABLE = "fire.acc.enable" // 日志累加器开关 lazy val FIRE_ACC_LOG_ENABLE = "fire.acc.log.enable" // 多值累加器开关 lazy val FIRE_ACC_MULTI_COUNTER_ENABLE = "fire.acc.multi.counter.enable" // 多时间维度累加器开关 lazy val FIRE_ACC_MULTI_TIMER_ENABLE = "fire.acc.multi.timer.enable" // env累加器开关 lazy val FIRE_ACC_ENV_ENABLE = "fire.acc.env.enable" // fire框架埋点日志开关,当关闭后,埋点的日志将不再被记录到日志累加器中,并且也不再打印 lazy val FIRE_LOG_ENABLE = "fire.log.enable" // 用于限定fire框架中sql日志的字符串长度 lazy val FIRE_LOG_SQL_LENGTH = "fire.log.sql.length" // fire框架rest接口服务最大线程数 lazy val FIRE_RESTFUL_MAX_THREAD = "fire.restful.max.thread" lazy val FIRE_CONNECTOR_SHUTDOWN_HOOK_ENABLE = "fire.connector.shutdown_hook.enable" // 用于配置是否抛弃配置中心独立运行 lazy val FIRE_CONFIG_CENTER_ENABLE = "fire.config_center.enable" // 本地运行环境下(Windows、Mac)是否调用配置中心接口获取配置信息 lazy val FIRE_CONFIG_CENTER_LOCAL_ENABLE = "fire.config_center.local.enable" // 配置中心接口调用秘钥 lazy val FIRE_CONFIG_CENTER_SECRET = "fire.config_center.register.conf.secret" // fire框架restful端口冲突重试次数 lazy val FIRE_RESTFUL_PORT_RETRY_NUM = "fire.restful.port.retry_num" // fire框架restful端口冲突重试时间(ms) lazy val FIRE_RESTFUL_PORT_RETRY_DURATION = "fire.restful.port.retry_duration" lazy val FIRE_REST_SERVER_SECRET = "fire.rest.server.secret" lazy val FIRE_LOG_LEVEL_CONF_PREFIX = "fire.log.level.conf." lazy val FIRE_ARTHAS_CONF_PREFIX = "fire.analysis.arthas.conf." 
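  // --- Illustrative sketch (added comment, not part of the original source) ---
  // Keys that end with a dot act as prefixes: PropUtils.sliceKeys collects every property under the
  // prefix into a Map (see arthasConfMap and lineageDatasourceMap further below). The helper is a
  // hypothetical example of inspecting such a sliced map; the property name mentioned in the body
  // (fire.analysis.arthas.conf.httpPort) is made up for illustration only.
  private[this] def printArthasConfSketch(): Unit = {
    // e.g. fire.analysis.arthas.conf.httpPort=8563 would surface as ("httpPort" -> "8563"),
    // assuming sliceKeys strips the matched prefix from each key
    val sliced: Map[String, String] = PropUtils.sliceKeys(FIRE_ARTHAS_CONF_PREFIX)
    sliced.foreach { case (key, value) => println(s"arthas conf -> $key = $value") }
  }
  // --- end of sketch ---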
lazy val FIRE_USER_COMMON_CONF = "fire.user.common.conf" // 日志记录器保留最少的记录数 lazy val FIRE_ACC_LOG_MIN_SIZE = "fire.acc.log.min.size" // 日志记录器保留最多的记录数 lazy val FIRE_ACC_LOG_MAX_SIZE = "fire.acc.log.max.size" // env累加器保留最多的记录数 lazy val FIRE_ACC_ENV_MAX_SIZE = "fire.acc.env.max.size" // env累加器保留最少的记录数 lazy val FIRE_ACC_ENV_MIN_SIZE = "fire.acc.env.min.size" // timer累加器保留最大的记录数 lazy val FIRE_ACC_TIMER_MAX_SIZE = "fire.acc.timer.max.size" // timer累加器清理几小时之前的记录 lazy val FIRE_ACC_TIMER_MAX_HOUR = "fire.acc.timer.max.hour" // 定时调度任务黑名单(定时任务方法名),以逗号分隔 lazy val FIRE_SCHEDULER_BLACKLIST = "fire.scheduler.blacklist" // 用于配置是否启用任务定时调度 lazy val FIRE_TASK_SCHEDULE_ENABLE = "fire.task.schedule.enable" // quartz最大线程池大小 lazy val FIRE_QUARTZ_MAX_THREAD = "fire.quartz.max.thread" // fire框架restful地址 lazy val FIRE_REST_URL = "fire.rest.url" lazy val FIRE_SHUTDOWN_EXIT = "fire.shutdown.auto.exit" // print记录数限制 lazy val FIRE_PRINT_LIMIT = "fire.print.limit" lazy val FIRE_HIVE_METASTORE_URL_RANDOM_ENABLE = "fire.hive.metastore.url.random.enable" // 配置中心生产环境注册地址 lazy val FIRE_CONFIG_CENTER_REGISTER_CONF_PROD_ADDRESS = "fire.config_center.register.conf.prod.address" // 配置中心测试环境注册地址 lazy val FIRE_CONFIG_CENTER_REGISTER_CONF_TEST_ADDRESS = "fire.config_center.register.conf.test.address" // 配置打印黑名单,配置项以逗号分隔 lazy val FIRE_CONF_PRINT_BLACKLIST = "fire.conf.print.blacklist" // 是否启用动态配置功能 lazy val FIRE_DYNAMIC_CONF_ENABLE = "fire.dynamic.conf.enable" // 是否打印配置信息 lazy val FIRE_CONF_SHOW_ENABLE = "fire.conf.show.enable" // 是否将fire restful地址以日志形式打印 lazy val FIRE_REST_URL_SHOW_ENABLE = "fire.rest.url.show.enable" lazy val SPARK_STREAMING_CONF_FILE = "spark-streaming" lazy val SPARK_STRUCTURED_STREAMING_CONF_FILE = "structured-streaming" lazy val SPARK_CORE_CONF_FILE = "spark-core" lazy val FLINK_CONF_FILE = "flink" lazy val FLINK_STREAMING_CONF_FILE = "flink-streaming" lazy val FLINK_BATCH_CONF_FILE = "flink-batch" lazy val FIRE_DEPLOY_CONF_ENABLE = "fire.deploy_conf.enable" lazy val FIRE_EXCEPTION_BUS_SIZE = "fire.exception_bus.size" lazy val FIRE_LINEAGE_ENABLE = "fire.lineage.enable" lazy val FIRE_LINEAGE_RUN_COUNT = "fire.lineage.run.count" lazy val FIRE_LINEAGE_MAX_SIZE = "fire.lineage.max.size" lazy val FIRE_LINEAGE_RUN_INITIAL_DELAY = "fire.lineage.run.initialDelay" lazy val FIRE_LINEAGE_RUN_PERIOD = "fire.lineage.run.period" lazy val FIRE_LINEAGE_DATASOURCE_MAP = "fire.lineage.datasource.map." 
lazy val FIRE_LINEAGE_SEND_MQ_ENABLE = "fire.lineage.send.mq.enable" lazy val FIRE_LINEAGE_SEND_MQ_URL = "fire.lineage.send.mq.url" lazy val FIRE_LINEAGE_SEND_MQ_TOPIC = "fire.lineage.send.mq.topic" lazy val FIRE_CONF_ADAPTIVE_PREFIX = "fire.conf.adaptive.prefix" lazy val FIRE_ANALYSIS_ARTHAS_ENABLE = "fire.analysis.arthas.enable" lazy val FIRE_ANALYSIS_ARTHAS_CONTAINER_ENABLE = "fire.analysis.arthas.container.enable" lazy val FIRE_ANALYSIS_ARTHAS_TUNNEL_SERVER_URL = "fire.analysis.arthas.tunnel_server.url" lazy val FIRE_ARTHAS_LAUNCHER = "fire.analysis.arthas.launcher" lazy val FIRE_ENV_LOCAL = "fire.env.local" lazy val FIRE_CONF_ANNO_MANAGER_CLASS = "fire.conf.anno.manager.class" lazy val FIRE_CONF_ANNOTATION = "fire.conf.annotation.enable" lazy val FIRE_ANALYSIS_LOG_EXCEPTION_STACK_ENABLE = "fire.analysis.log.exception.stack.enable" lazy val FIRE_ANALYSIS_LOG_EXCEPTION_SEND_MAX_RETIRES = "fire.analysis.log.exception.send.maxRetires" lazy val FIRE_ANALYSIS_LOG_EXCEPTION_SEND_TIMEOUT = "fire.analysis.log.exception.send.timeout" lazy val FIRE_ANALYSIS_LOG_EXCEPTION_SEND_MQ_URL = "fire.analysis.log.exception.send.mq.url" lazy val FIRE_ANALYSIS_LOG_EXCEPTION_SEND_MQ_TOPIC = "fire.analysis.log.exception.send.mq.topic" lazy val FIRE_JOB_AUTO_START = "fire.job.autoStart" lazy val FIRE_ACC_SYNC_MAX_SIZE = "fire.acc.sync.max.size" /** * 用于jdbc url的识别,当无法通过driver class识别数据源时,将从url中的端口号进行区分 * 不同数据配置使用统一的前缀:fire.lineage.datasource.map. */ def lineageDatasourceMap: Map[String, String] = PropUtils.sliceKeys(this.FIRE_LINEAGE_DATASOURCE_MAP) // 获取当前任务的rest server访问地址 lazy val fireRestUrl = PropUtils.getString(this.FIRE_REST_URL, "") // 是否启用hostname作为fire rest url lazy val restUrlHostname = PropUtils.getBoolean(this.FIRE_REST_URL_HOSTNAME, false) // 不同引擎配置获取具体的实现 lazy val confDeployEngine = PropUtils.getString(this.FIRE_CONF_DEPLOY_ENGINE, "") // 定时解析埋点SQL的执行频率(s) lazy val lineageRunPeriod = PropUtils.getInt(this.FIRE_LINEAGE_RUN_PERIOD, 60) // 定时解析埋点SQL的初始延迟(s) lazy val lineageRunInitialDelay = PropUtils.getInt(this.FIRE_LINEAGE_RUN_INITIAL_DELAY, 60) // 用于存放埋点的队列最大大小,超过该大小将会被丢弃 lazy val lineMaxSize = PropUtils.getInt(this.FIRE_LINEAGE_MAX_SIZE, 200) // 异步解析血缘线程执行的次数 lazy val lineageRunCount = PropUtils.getInt(this.FIRE_LINEAGE_RUN_COUNT, 10) // 是否开启实时血缘埋点 lazy val lineageEnable = PropUtils.getBoolean(this.FIRE_LINEAGE_ENABLE, true) lazy val lineageSendMqEnable = PropUtils.getBoolean(this.FIRE_LINEAGE_SEND_MQ_ENABLE, false) lazy val lineageMQUrl = { val url = PropUtils.getString(this.FIRE_LINEAGE_SEND_MQ_URL, "") FireKafkaConf.kafkaBrokers(url) } lazy val lineageTopic = PropUtils.getString(this.FIRE_LINEAGE_SEND_MQ_TOPIC) // 每个jvm实例内部queue用于存放异常对象数最大大小,避免队列过大造成内存溢出 lazy val exceptionBusSize = PropUtils.getInt(this.FIRE_EXCEPTION_BUS_SIZE, 1000) // 是否将配置同步到executor、taskmanager端 lazy val deployConf = PropUtils.getBoolean(this.FIRE_DEPLOY_CONF_ENABLE, true) // fire内置线程池大小 lazy val threadPoolSize = PropUtils.getInt(this.FIRE_THREAD_POOL_SIZE, 5) // fire内置定时任务线程池大小 lazy val threadPoolSchedulerSize = PropUtils.getInt(this.FIRE_THREAD_POOL_SCHEDULE_SIZE, 5) // 自适应前缀,调用getOriginalProperty避免栈溢出 lazy val adaptivePrefix = PropUtils.getOriginalProperty(this.FIRE_CONF_ADAPTIVE_PREFIX).toBoolean // 用户公共配置文件列表 lazy val userCommonConf = PropUtils.getString(this.FIRE_USER_COMMON_CONF, "").split(",").map(conf => conf.trim).toList // fire接口认证秘钥 lazy val restServerSecret = PropUtils.getString(this.FIRE_REST_SERVER_SECRET) // 用于配置是否在调用shutdown后主动退出jvm进程 lazy val shutdownExit = PropUtils.getBoolean(this.FIRE_SHUTDOWN_EXIT, 
false) // 是否启用为connector注册shutdown hook,当jvm退出前close lazy val connectorShutdownHookEnable = PropUtils.getBoolean(this.FIRE_CONNECTOR_SHUTDOWN_HOOK_ENABLE, false) // 用于指定当前运行环境是否为local模式(主要用于flink-shell的本地配置文件加载) lazy val localEnv = PropUtils.getBoolean(this.FIRE_ENV_LOCAL, false) // fire日志打印黑名单 lazy val fireConfBlackList: Set[String] = { val blacklist = PropUtils.getString(this.FIRE_CONF_PRINT_BLACKLIST, "") if (StringUtils.isNotBlank(blacklist)) blacklist.split(",").toSet else Set.empty } // 获取driver的class name lazy val driverClassName = PropUtils.getString(this.DRIVER_CLASS_NAME) // 是否打印配置信息 lazy val fireConfShow: Boolean = PropUtils.getBoolean(this.FIRE_CONF_SHOW_ENABLE, false) // 是否将restful地址以日志方式打印 lazy val fireRestUrlShow: Boolean = PropUtils.getBoolean(this.FIRE_REST_URL_SHOW_ENABLE, false) // 获取动态配置参数 lazy val dynamicConf: Boolean = PropUtils.getBoolean(this.FIRE_DYNAMIC_CONF_ENABLE, true) // 用于获取fire版本号 lazy val fireVersion = PropUtils.getString(this.FIRE_VERSION, "1.0.0") // quartz最大线程池大小 lazy val quartzMaxThread = PropUtils.getString(this.FIRE_QUARTZ_MAX_THREAD, "8") // 用于设置是否启用任务定时调度 lazy val scheduleEnable = PropUtils.getBoolean(this.FIRE_TASK_SCHEDULE_ENABLE, true) // 定时任务黑名单,配置的value为方法名,多个以逗号分隔 def schedulerBlackList: String = PropUtils.getString(this.FIRE_SCHEDULER_BLACKLIST, "") // env累加器开关 lazy val accEnvEnable = PropUtils.getBoolean(this.FIRE_ACC_ENV_ENABLE, true) // 是否启用Fire内置的restful服务 lazy val restEnable = PropUtils.getBoolean(this.FIRE_REST_ENABLE, true) // rest接口权限认证 lazy val restFilter = PropUtils.getBoolean(this.FIRE_REST_FILTER_ENABLE, true) // 是否关闭fire内置的所有累加器 lazy val accEnable = PropUtils.getBoolean(this.FIRE_ACC_ENABLE, true) // 日志累加器开关 lazy val accLogEnable = PropUtils.getBoolean(this.FIRE_ACC_LOG_ENABLE, true) // 多值累加器开关 lazy val accMultiCounterEnable = PropUtils.getBoolean(this.FIRE_ACC_MULTI_COUNTER_ENABLE, true) // 多时间维度累加器开关 lazy val accMultiTimerEnable = PropUtils.getBoolean(this.FIRE_ACC_MULTI_TIMER_ENABLE, true) // fire框架埋点日志开关 lazy val logEnable = PropUtils.getBoolean(this.FIRE_LOG_ENABLE, true) // 用于限定fire框架中sql日志的字符串长度 lazy val logSqlLength = PropUtils.getInt(this.FIRE_LOG_SQL_LENGTH, 50) // 配置中心生产环境注册地址 lazy val configCenterProdAddress = PropUtils.getString(this.FIRE_CONFIG_CENTER_REGISTER_CONF_PROD_ADDRESS, "") // 配置中心测试环境注册地址 lazy val configCenterTestAddress = PropUtils.getString(this.FIRE_CONFIG_CENTER_REGISTER_CONF_TEST_ADDRESS) // fire框架rest接口服务最大线程数 lazy val restfulMaxThread = PropUtils.getInt(this.FIRE_RESTFUL_MAX_THREAD, 5) // 用于配置是否抛弃配置中心独立运行 lazy val configCenterEnable = PropUtils.getBoolean(this.FIRE_CONFIG_CENTER_ENABLE, true) // 本地运行环境下(Windows、Mac)是否调用配置中心接口获取配置信息 lazy val configCenterLocalEnable = PropUtils.getBoolean(this.FIRE_CONFIG_CENTER_LOCAL_ENABLE, false) // 配置中心接口调用秘钥 lazy val configCenterSecret = PropUtils.getString(this.FIRE_CONFIG_CENTER_SECRET, "") // fire框架restful端口冲突重试次数 lazy val restfulPortRetryNum = PropUtils.getInt(this.FIRE_RESTFUL_PORT_RETRY_NUM, 3) // fire框架restful端口冲突重试时间(ms) lazy val restfulPortRetryDuration = PropUtils.getLong(this.FIRE_RESTFUL_PORT_RETRY_DURATION, 1000L) // 用于限定日志最少保存量,防止当日志量达到maxLogSize时频繁的进行clear操作 lazy val minLogSize = PropUtils.getInt(this.FIRE_ACC_LOG_MIN_SIZE, 500).abs // 用于限定日志最大保存量,防止日志量过大,撑爆driver lazy val maxLogSize = PropUtils.getInt(this.FIRE_ACC_LOG_MAX_SIZE, 1000).abs // 用于限定运行时信息最少保存量,防止当运行时信息量达到maxEnvSize时频繁的进行clear操作 lazy val minEnvSize = PropUtils.getInt(this.FIRE_ACC_ENV_MIN_SIZE, 100).abs // 用于限定运行时信息最大保存量,防止过大撑爆driver lazy val maxEnvSize = 
PropUtils.getInt(this.FIRE_ACC_ENV_MAX_SIZE, 500).abs // 用于限定最大保存量,防止数据量过大,撑爆driver lazy val maxTimerSize = PropUtils.getInt(this.FIRE_ACC_TIMER_MAX_SIZE, 1000).abs // 用于指定清理指定小时数之前的记录 lazy val maxTimerHour = PropUtils.getInt(this.FIRE_ACC_TIMER_MAX_HOUR, 12).abs // print记录数限制 lazy val printLimit = PropUtils.getLong(this.FIRE_PRINT_LIMIT, 1000000) // 是否启用hive metastore url的随机选择 lazy val hiveMetastoreUrlRandomEnable = PropUtils.getBoolean(this.FIRE_HIVE_METASTORE_URL_RANDOM_ENABLE, true) // 是否启用arthas用于分析实时任务的性能 lazy val arthasEnable = PropUtils.getBoolean(this.FIRE_ANALYSIS_ARTHAS_ENABLE, false) && StringUtils.isNotBlank(this.arthasTunnelServerUrl) // 是否在container端启动arthas lazy val arthasContainerEnable = PropUtils.getBoolean(this.FIRE_ANALYSIS_ARTHAS_CONTAINER_ENABLE, false) // arthas tunnel服务的地址 lazy val arthasTunnelServerUrl = PropUtils.getString(this.FIRE_ANALYSIS_ARTHAS_TUNNEL_SERVER_URL) // arthas的参数 def arthasConfMap: Map[String, String] = PropUtils.sliceKeys(this.FIRE_ARTHAS_CONF_PREFIX) // 动态获取最新的secret def dynamicKey: String = this.restServerSecret + this.driverClassName + DateFormatUtils.formatCurrentDate // arthas启动器 lazy val arthasLauncher = PropUtils.getString(this.FIRE_ARTHAS_LAUNCHER) // 主键配置映射管理器子类实现 lazy val annoManagerClass = PropUtils.getString(this.FIRE_CONF_ANNO_MANAGER_CLASS) // 是否启用基于注解的方式进行配置 lazy val annoConfEnable = PropUtils.getBoolean(this.FIRE_CONF_ANNOTATION, true) // 是否启用异常堆栈采集 def exceptionTraceEnable: Boolean = PropUtils.getBoolean(this.FIRE_ANALYSIS_LOG_EXCEPTION_STACK_ENABLE, false) // 异常堆栈发送MQ失败最大重试次数 lazy val exceptionTraceSendMQMaxRetries = PropUtils.getInt(this.FIRE_ANALYSIS_LOG_EXCEPTION_SEND_MAX_RETIRES, 10) // 异常日志发送MQ超时时间 lazy val exceptionSendTimeout = PropUtils.getInt(this.FIRE_ANALYSIS_LOG_EXCEPTION_SEND_TIMEOUT, 3000) // 异常发送的mq的集群url def exceptionTraceMQ: String = { val url = PropUtils.getString(this.FIRE_ANALYSIS_LOG_EXCEPTION_SEND_MQ_URL, "") FireKafkaConf.kafkaBrokers(url) } // 异常发送到mq的哪个topic def exceptionTraceMQTopic: String = PropUtils.getString(this.FIRE_ANALYSIS_LOG_EXCEPTION_SEND_MQ_TOPIC, "") // 是否自动提交job lazy val jobAutoStart = PropUtils.getBoolean(this.FIRE_JOB_AUTO_START, true) lazy val accSyncMaxSize = PropUtils.getLong(this.FIRE_ACC_SYNC_MAX_SIZE, 100) } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/conf/FireHDFSConf.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.conf import com.zto.fire.common.util.PropUtils /** * HDFS配置 * * @author ChengLong * @since 1.1.0 * @create 2020-07-13 15:07 */ private[fire] object FireHDFSConf { // 是否启用高可用 lazy val HDFS_HA = "hdfs.ha.enable" lazy val HDFS_HA_PREFIX = "hdfs.ha.conf." 
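  // --- Illustrative sketch (added comment, not part of the original source) ---
  // HA settings are looked up per hive cluster: with hive.cluster=batch (hypothetical value), a key
  // such as hdfs.ha.conf.batch.dfs.nameservices=ns1 would be returned by hdfsHAConf below. The helper
  // only shows how the resulting key/value pairs could be handed to some configuration object; the
  // setter is passed in so no concrete Hadoop API is assumed here.
  private[this] def applyHAConfSketch(setter: (String, String) => Unit): Unit = {
    this.hdfsHAConf.foreach { case (key, value) => setter(key, value) }
  }
  // --- end of sketch ---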
// 配置是否启用hdfs HA lazy val hdfsHAEnable = PropUtils.getBoolean(this.HDFS_HA, true) /** * 读取HDFS高可用相关配置信息 */ def hdfsHAConf: Map[String, String] = { if (this.hdfsHAEnable) { PropUtils.sliceKeys(s"${this.HDFS_HA_PREFIX}${FireHiveConf.hiveCluster}.") } else Map.empty } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/conf/FireHiveConf.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.conf import com.zto.fire.common.util.{PropUtils, StringsUtils} import spark.utils.StringUtils /** * hive相关配置 * * @author ChengLong * @since 1.1.0 * @create 2020-07-13 15:02 */ private[fire] object FireHiveConf { lazy val HIVE_CLUSTER = "hive.cluster" // hive版本号 lazy val HIVE_VERSION = "hive.version" // hive的catalog名称 lazy val HIVE_CATALOG_NAME = "hive.catalog.name" lazy val HIVE_CLUSTER_MAP_PREFIX = "fire.hive.cluster.map." lazy val HIVE_SITE_PATH_MAP_PREFIX = "fire.hive.site.path.map." lazy val HIVE_CONF_PREFIX = "hive.conf." 
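  // --- Illustrative sketch (added comment, not part of the original source) ---
  // The alias configured via hive.cluster is resolved against the mapping declared under the
  // fire.hive.cluster.map. prefix, e.g. fire.hive.cluster.map.batch=thrift://host1:9083,thrift://host2:9083
  // (hypothetical value). getMetastoreUrl below then returns the mapped value, picking one address at
  // random while fire.hive.metastore.url.random.enable stays on (its default). The helper only logs the outcome.
  private[this] def printMetastoreSketch(): Unit = {
    println(s"hive cluster alias '$hiveCluster' resolves to metastore url: $getMetastoreUrl")
  }
  // --- end of sketch ---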
// 默认的库名 lazy val DEFAULT_DATABASE_NAME = "fire.hive.default.database.name" // 默认的数据库名称 lazy val dbName = "tmp" // 默认的分区名称 lazy val DEFAULT_TABLE_PARTITION_NAME = "fire.hive.table.default.partition.name" // 默认的partition名称 lazy val defaultPartitionName = "ds" // hive集群标识(batch/streaming/test) lazy val hiveCluster = PropUtils.getString(this.HIVE_CLUSTER, "") // 初始化hive集群名称与metastore映射 private lazy val hiveMetastoreMap = PropUtils.sliceKeys(this.HIVE_CLUSTER_MAP_PREFIX) // hive-site.xml存放路径映射 private lazy val hiveSiteMap = PropUtils.sliceKeys(this.HIVE_SITE_PATH_MAP_PREFIX) // hive版本号 lazy val hiveVersion = PropUtils.getString(this.HIVE_VERSION, "1.1.0") // hive catalog名称 lazy val hiveCatalogName = PropUtils.getString(this.HIVE_CATALOG_NAME, "hive") // hive的set配置,如:this.spark.sql("set hive.exec.dynamic.partition=true") lazy val hiveConfMap = PropUtils.sliceKeys(this.HIVE_CONF_PREFIX) lazy val defaultDB = PropUtils.getString(this.DEFAULT_DATABASE_NAME) lazy val partitionName = PropUtils.getString(this.DEFAULT_TABLE_PARTITION_NAME, this.defaultPartitionName) /** * 根据hive集群名称获取metastore地址 */ def getMetastoreUrl: String = { val confUrl = this.hiveMetastoreMap.getOrElse(hiveCluster, hiveCluster) if (StringUtils.isNotBlank(confUrl) && FireFrameworkConf.hiveMetastoreUrlRandomEnable) StringsUtils.randomSplit(confUrl, ",") else confUrl } /** * 获取hive-site.xml的存放路径 * * @return * /path/to/hive-site.xml */ def getHiveConfDir: String = { this.hiveSiteMap.getOrElse(hiveCluster, hiveCluster) } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/conf/FireKafkaConf.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.conf import com.zto.fire.common.util.{PropUtils, StringsUtils} /** * kafka相关配置 * * @author ChengLong * @since 1.1.0 * @create 2020-07-13 14:58 */ private[fire] object FireKafkaConf { lazy val offsetLargest = "latest" lazy val offsetSmallest = "earliest" lazy val offsetNone = "none" lazy val clusterMapConfStart = "fire.kafka.cluster.map." lazy val kafkaConfStart = "kafka.conf." 
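  // --- Illustrative sketch (added comment, not part of the original source) ---
  // Broker aliases declared under fire.kafka.cluster.map. (e.g. fire.kafka.cluster.map.bigdata=node1:9092,node2:9092,
  // a hypothetical value) are resolved by kafkaBrokers below. The keyNum argument selects the numbered
  // variant of a key, so keyNum = 2 typically reads kafka.brokers.name2, following the KeyNum convention.
  private[this] def printBrokerSketch(): Unit = {
    println(s"primary cluster  : ${kafkaBrokers(1)}") // resolves the alias behind kafka.brokers.name
    println(s"secondary cluster: ${kafkaBrokers(2)}") // resolves the alias behind kafka.brokers.name2
  }
  // --- end of sketch ---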
lazy val KAFKA_BROKERS_NAME = "kafka.brokers.name" // kafka的topic列表,以逗号分隔 lazy val KAFKA_TOPICS = "kafka.topics" // group.id lazy val KAFKA_GROUP_ID = "kafka.group.id" // kafka起始消费位点 lazy val KAFKA_STARTING_OFFSET = "kafka.starting.offsets" // kafka结束消费位点 lazy val KAFKA_ENDING_OFFSET = "kafka.ending.offsets" // 是否自动维护offset lazy val KAFKA_ENABLE_AUTO_COMMIT = "kafka.enable.auto.commit" // 丢失数据是否失败 lazy val KAFKA_FAIL_ON_DATA_LOSS = "kafka.failOnDataLoss" // kafka session超时时间 lazy val KAFKA_SESSION_TIMEOUT_MS = "kafka.session.timeout.ms" // kafka request超时时间 lazy val KAFKA_REQUEST_TIMEOUT_MS = "kafka.request.timeout.ms" lazy val KAFKA_MAX_POLL_INTERVAL_MS = "kafka.max.poll.interval.ms" lazy val KAFKA_COMMIT_OFFSETS_ON_CHECKPOINTS = "kafka.CommitOffsetsOnCheckpoints" lazy val KAFKA_START_FROM_TIMESTAMP = "kafka.StartFromTimestamp" lazy val KAFKA_START_FROM_GROUP_OFFSETS = "kafka.StartFromGroupOffsets" // 是否使状态中存放的offset不生效(请谨慎配置,用于kafka集群迁移等不正常状况的运维) lazy val KAFKA_OVERWRITE_STATE_OFFSET = "kafka.force.overwrite.stateOffset.enable" // 是否在开启checkpoint的情况下强制开启周期性offset提交 lazy val KAFKA_FORCE_AUTO_COMMIT = "kafka.force.autoCommit.enable" // 周期性提交offset的时间间隔(ms) lazy val KAFKA_FORCE_AUTO_COMMIT_INTERVAL = "kafka.force.autoCommit.Interval" // 初始化kafka集群名称与地址映射 private[fire] lazy val kafkaMap = PropUtils.sliceKeys(clusterMapConfStart) // kafka消费起始位点 def kafkaStartingOffset(keyNum: Int = 1): String = PropUtils.getString(this.KAFKA_STARTING_OFFSET, "", keyNum) // kafka消费结束位点 def kafkaEndingOffsets(keyNum: Int = 1): String = PropUtils.getString(this.KAFKA_ENDING_OFFSET, "", keyNum) // 丢失数据时是否失败 def kafkaFailOnDataLoss(keyNum: Int = 1): Boolean = PropUtils.getBoolean(this.KAFKA_FAIL_ON_DATA_LOSS, true, keyNum) // enable.auto.commit def kafkaEnableAutoCommit(keyNum: Int = 1): Boolean = PropUtils.getBoolean(this.KAFKA_ENABLE_AUTO_COMMIT, false, keyNum) // 获取topic列表 def kafkaTopics(keyNum: Int = 1): String = PropUtils.getString(this.KAFKA_TOPICS, "", keyNum) // kafka session超时时间,默认5分钟 def kafkaSessionTimeOut(keyNum: Int = 1): java.lang.Integer = PropUtils.getInt(this.KAFKA_SESSION_TIMEOUT_MS, 300000, keyNum) // kafka request超时时间,默认10分钟 def kafkaPollInterval(keyNum: Int = 1): java.lang.Integer = PropUtils.getInt(this.KAFKA_MAX_POLL_INTERVAL_MS, 600000, keyNum) // kafka request超时时间 def kafkaRequestTimeOut(keyNum: Int = 1): java.lang.Integer = PropUtils.getInt(this.KAFKA_REQUEST_TIMEOUT_MS, 400000, keyNum) // 配置文件中的groupId def kafkaGroupId(keyNum: Int = 1): String = PropUtils.getString(this.KAFKA_GROUP_ID, "", keyNum) // 是否在checkpoint时记录offset值 def kafkaCommitOnCheckpoint(keyNum: Int = 1): Boolean = PropUtils.getBoolean(this.KAFKA_COMMIT_OFFSETS_ON_CHECKPOINTS, true, keyNum) // 设置从指定时间戳位置开始消费kafka def kafkaStartFromTimeStamp(keyNum: Int = 1): java.lang.Long = PropUtils.getLong(this.KAFKA_START_FROM_TIMESTAMP, 0L, keyNum) // 从topic中指定的group上次消费的位置开始消费,必须配置group.id参数 def kafkaStartFromGroupOffsets(keyNum: Int = 1): Boolean = PropUtils.getBoolean(this.KAFKA_START_FROM_GROUP_OFFSETS, false, keyNum) // kafka-client配置信息 def kafkaConfMap(keyNum: Int = 1): collection.immutable.Map[String, String] = PropUtils.sliceKeysByNum(kafkaConfStart, keyNum) // 是否使状态中存放的offset不生效 def kafkaForceOverwriteStateOffset: Boolean = PropUtils.getBoolean(this.KAFKA_OVERWRITE_STATE_OFFSET, false) // 是否在开启checkpoint的情况下强制开启周期性offset提交 def kafkaForceCommit: Boolean = PropUtils.getBoolean(this.KAFKA_FORCE_AUTO_COMMIT, false) // 周期性提交offset的时间间隔(ms) def kafkaForceCommitInterval: Long = 
PropUtils.getLong(this.KAFKA_FORCE_AUTO_COMMIT_INTERVAL, 30000) def kafkaConfMapWithType(keyNum: Int = 1): collection.immutable.Map[String, Object] = { val map = new collection.mutable.HashMap[String, Object]() this.kafkaConfMap(keyNum).foreach(kv => { map.put(kv._1, StringsUtils.parseString(kv._2)) }) map.toMap } /** * 根据名称获取kafka broker地址 */ def kafkaBrokers(keyNum: Int = 1): String = { val brokerName = PropUtils.getString(this.KAFKA_BROKERS_NAME, "", keyNum) this.kafkaBrokers(brokerName) } /** * 根据url或别名返回真实的url地址 */ def kafkaBrokers(url: String): String = { this.kafkaMap.getOrElse(url, url) } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/conf/FirePS1Conf.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.conf /** * 颜色预定义 * * @author ChengLong * @since 1.1.0 * @create 2020-07-13 15:01 */ private[fire] object FirePS1Conf { // 颜色相关 lazy val GREEN = "\u001B[32m" lazy val DEFAULT = "\u001B[0m" lazy val RED = "\u001B[31m" lazy val YELLOW = "\u001B[33m" lazy val BLUE = "\u001B[34m" lazy val PURPLE = "\u001B[35m" lazy val PINK = "\u001B[35m" // 字体相关 lazy val HIGH_LIGHT = "\u001B[1m" lazy val ITALIC = "\u001B[3m" lazy val UNDER_LINE = "\u001B[4m" lazy val FLICKER = "\u001B[5m" /** * 包裹处理 * * @param str * 原字符串 * @param ps1 * ps1 * @return * wrap后的字符串 */ def wrap(str: String, ps1: String*): String = { val printStr = new StringBuilder() ps1.foreach(ps => { printStr.append(ps) }) printStr.append(str + DEFAULT).toString() } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/conf/FireRocketMQConf.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.conf import com.zto.fire.common.util.PropUtils import org.apache.commons.lang3.StringUtils /** * RocketMQ相关配置 * * @author ChengLong * @since 1.1.0 * @create 2020-07-13 14:58 */ private[fire] object FireRocketMQConf { lazy val rocketOffsetLargest = "latest" lazy val rocketOffsetSmallest = "earliest" lazy val rocketConsumerTag = "*" lazy val rocketClusterMapConfStart = "fire.rocket.cluster.map." // 初始化kafka集群名称与地址映射 private[fire] lazy val rocketClusterMap = PropUtils.sliceKeys(rocketClusterMapConfStart) lazy val rocketConfStart = "rocket.conf." // rocketMQ name server lazy val ROCKET_BROKERS_NAME = "rocket.brokers.name" // rocketMQ topic信息,多个以逗号分隔 lazy val ROCKET_TOPICS = "rocket.topics" // rocketMQ groupId val ROCKET_GROUP_ID = "rocket.group.id" // 丢失数据是否失败 lazy val ROCKET_FAIL_ON_DATA_LOSS = "rocket.failOnDataLoss" lazy val ROCKET_FORCE_SPECIAL = "rocket.forceSpecial" // 是否自动维护offset lazy val ROCKET_ENABLE_AUTO_COMMIT = "rocket.enable.auto.commit" // RocketMQ起始消费位点 lazy val ROCKET_STARTING_OFFSET = "rocket.starting.offsets" // rocketMq订阅的tag lazy val ROCKET_CONSUMER_TAG = "rocket.consumer.tag" // 每次拉取每个partition的消息数 lazy val ROCKET_PULL_MAX_SPEED_PER_PARTITION = "rocket.pull.max.speed.per.partition" lazy val ROCKET_INSTANCE_ID = "rocket.consumer.instance" lazy val ROCKET_SINK_PARALLELISM = "rocket.sink.parallelism" // 用于标识消费者的名称 def rocketInstanceId(keyNum: Int = 1): String = PropUtils.getString(this.ROCKET_INSTANCE_ID, "", keyNum) // rocket-client配置信息 def rocketConfMap(keyNum: Int = 1): collection.immutable.Map[String, String] = PropUtils.sliceKeysByNum(rocketConfStart, keyNum) // 获取消费位点 def rocketStartingOffset(keyNum: Int = 1): String = PropUtils.getString(this.ROCKET_STARTING_OFFSET, "", keyNum) // 丢失数据时是否失败 def rocketFailOnDataLoss(keyNum: Int = 1): Boolean = PropUtils.getBoolean(this.ROCKET_FAIL_ON_DATA_LOSS, true, keyNum) // spark.rocket.forceSpecial def rocketForceSpecial(keyNum: Int = 1): Boolean = PropUtils.getBoolean(this.ROCKET_FORCE_SPECIAL, false, keyNum) // enable.auto.commit def rocketEnableAutoCommit(keyNum: Int = 1): Boolean = PropUtils.getBoolean(this.ROCKET_ENABLE_AUTO_COMMIT, false, keyNum) // 获取rocketMQ 订阅的tag def rocketConsumerTag(keyNum: Int = 1): String = PropUtils.getString(this.ROCKET_CONSUMER_TAG, "", keyNum) // 获取groupId def rocketGroupId(keyNum: Int = 1): String = PropUtils.getString(this.ROCKET_GROUP_ID, "", keyNum) // 获取rocket topic列表 def rocketTopics(keyNum: Int = 1): String = PropUtils.getString(this.ROCKET_TOPICS, null, keyNum) // 每次拉取每个partition的消息数 def rocketPullMaxSpeedPerPartition(keyNum: Int = 1): String = PropUtils.getString(this.ROCKET_PULL_MAX_SPEED_PER_PARTITION, "", keyNum) // sink rocketmq的并行度 def rocketSinkParallelism(keyNum: Int = 1): Int = PropUtils.getInt(this.ROCKET_SINK_PARALLELISM, -1, keyNum) // 获取rocketMQ name server 地址 def rocketNameServer(keyNum: Int = 1): String = { val brokerName = PropUtils.getString(this.ROCKET_BROKERS_NAME, "", keyNum) this.rocketNameServer(brokerName) } /** * 根据url或别名返回真实的url地址 */ def rocketNameServer(url: String): String = { this.rocketClusterMap.getOrElse(url, url) } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/conf/KeyNum.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.conf /** * 用于定义一些常量,约定代码中keyNum参数与配置文件key的末尾数字相匹配 * * @author ChengLong * @Date 2022-04-30 14:24:41 * @since 2.2.1 */ object KeyNum { lazy val _1 = 1 lazy val _2 = 2 lazy val _3 = 3 lazy val _4 = 4 lazy val _5 = 5 lazy val _6 = 6 lazy val _7 = 7 lazy val _8 = 8 lazy val _9 = 9 lazy val _11 = 11 lazy val _12 = 12 lazy val _13 = 13 lazy val _14 = 14 lazy val _15 = 15 lazy val _16 = 16 lazy val _17 = 17 lazy val _18 = 18 lazy val _19 = 19 lazy val _20 = 20 lazy val _21 = 21 lazy val _22 = 22 lazy val _23 = 23 lazy val _24 = 24 lazy val _25 = 25 lazy val _26 = 26 lazy val _27 = 27 lazy val _28 = 28 lazy val _29 = 29 lazy val _30 = 30 } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/ext/JavaExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.ext import com.zto.fire.predef._ /** * Java语法扩展 * * @author ChengLong * @since 2.0.0 * @create 2021-01-04 13:50 */ trait JavaExt { /** * Java map API扩展 */ implicit class MapExt[K, V](map: JMap[K, V]) { /** * map的get操作,如果map中存在则直接返回,否则会根据fun定义的逻辑进行value的初始化 * 注:fun中定义的逻辑仅会在key对应的value不存在时被调用一次 * * @param key map的key * @param fun 用于定义key对应value的初始化逻辑 * @return map中key对应的value */ def mergeGet(key: K)(fun: => V): V = { requireNonEmpty(key) if (!map.containsKey(key)) map.put(key, fun) map.get(key) } } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/ext/ScalaExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.ext import com.zto.fire.common.conf.FireFrameworkConf import java.util.regex.Pattern /** * scala相关扩展 * * @author ChengLong * @since 2.0.0 * @create 2021-01-04 10:32 */ trait ScalaExt { // 用于缓存转为驼峰标识的字符串与转换前的字符串的映射关系 private[this] lazy val humpMap = collection.mutable.Map[String, String]() private[this] var printCount = 0L /** * String API扩展 */ implicit class StringExt[K, V](str: String) { // 用于匹配带有下划线字符串的正则 private[this] lazy val humpPattern = Pattern.compile("(.*)_(\\w)(.*)") private[this] lazy val maxHumpMapSize = 10000 /** * 数据表字段名转换为驼峰式名字的实体类属性名 * * @return 转换后的驼峰式命名 */ def toHump: String = { val matcher = humpPattern.matcher(str) val humpStr = if (matcher.find) { (matcher.group(1) + matcher.group(2).toUpperCase + matcher.group(3)).toHump } else str if (humpMap.size <= this.maxHumpMapSize) humpMap += (humpStr -> str) humpStr } /** * 驼峰式的实体类属性名转换为数据表字段名 * * @return 转换后的以"_"分隔的数据表字段名 */ def unHump: String = humpMap.getOrElse(str, str.replaceAll("[A-Z]", "_$0").toLowerCase) } /** * print行数限制 */ private[this] def printLimit(x: Any)(fun: Any => Unit): Unit = { this.printCount += 1 if (FireFrameworkConf.printLimit <= 0 || this.printCount <= FireFrameworkConf.printLimit) { fun(x) } else if (this.printCount <= FireFrameworkConf.printLimit * 1.1){ Console.println(s"使用print打印行数超过fire.print.limit配置的${FireFrameworkConf.printLimit}条,生产环境请不要打印过多数据!") } } /** Prints an object to `out` using its `toString` method. * * @param x the object to print; may be null. * @group console-output */ def print(x: Any): Unit = this.printLimit(x)(x => Console.print(x)) /** Prints out an object to the default output, followed by a newline character. * * @param x the object to print. * @group console-output */ def println(x: Any): Unit = this.printLimit(x)(x => Console.println(x)) } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/package.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire import com.zto.fire.common.util.Tools /** * 预定义通用常用的api * * @author ChengLong 2020-12-8 15:15:00 */ package object predef extends Tools ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/ConfigurationCenterManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util import com.zto.fire.common.bean.config.ConfigurationParam import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.enu.ConfigureLevel import com.zto.fire.predef._ import org.apache.commons.lang3.StringUtils import org.slf4j.LoggerFactory /** * 配置中心管理器,用于读取配置中心中的配置信息 * * @author ChengLong * @since 2.0.0 * @create 2021-03-12 13:35 */ private[fire] object ConfigurationCenterManager extends Serializable with Logging { private lazy val configCenterProperties: JMap[ConfigureLevel, JMap[String, String]] = new JHashMap[ConfigureLevel, JMap[String, String]] /** * 构建配置中心请求参数 * * @param className * 当前任务主类名 */ private[this] def buildRequestParam(className: String): String = { val rest = FireFrameworkConf.fireRestUrl if (StringUtils.isBlank(rest)) this.logger.warn("Fire Rest Server 地址为空,将无法完成注册") s""" |{"className": "${className.replace("$", "")}", "url": "$rest", "fireVersion": "${FireFrameworkConf.fireVersion}", "zrcKey": "${FireFrameworkConf.configCenterSecret}", "engine": "${PropUtils.engine}"} """.stripMargin } /** * 通过参数调用指定的接口 */ private[this] def invoke(url: String, param: String): String = { try { HttpClientUtils.doPost(url, param) } catch { case _: Throwable => this.logger.error("调用配置中心接口失败,开始尝试调用测试环境配置中心接口。") "" } } /** * 调用外部配置中心接口获取配合信息 */ def invokeConfigCenter(className: String): JMap[ConfigureLevel, JMap[String, String]] = { if (!FireFrameworkConf.configCenterEnable || (OSUtils.isLocal && !FireFrameworkConf.configCenterLocalEnable)) return this.configCenterProperties val param = buildRequestParam(className) // 尝试从生产环境配置中心获取参数列表 var json = this.invoke(FireFrameworkConf.configCenterProdAddress, param) // 如果生产环境接口调用失败,可能存在网络隔离,则从测试环境配置中心获取参数列表 if (isEmpty(json)) json = this.invoke(FireFrameworkConf.configCenterTestAddress, param) if (isEmpty(json)) { // 考虑到任务的重要配置可能存放在配置中心,在接口不通的情况下发布任务存在风险,因此会强制任务退出 this.logger.error("配置中心注册接口不可用导致任务发布失败。如仍需紧急发布,请确保任务配置与配置中心保存一直,并在common.properties中添加以下参数:fire.config_center.enable=false") System.exit(-1) } else { if (FireFrameworkConf.fireConfShow) this.logger.info(s"成功获取配置中心配置信息:$json") val param = JSONUtils.parseObject[ConfigurationParam](json) if (noEmpty(param, param.getCode, param.getContent) && param.getCode == 200) { this.configCenterProperties.putAll(param.getContent) this.logger.debug("配置中心参数已生效") } } this.configCenterProperties } } ================================================ FILE: 
fire-common/src/main/scala/com/zto/fire/common/util/DateFormatUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.time.DateUtils import java.text.SimpleDateFormat import java.util.{Calendar, Date, TimeZone} import scala.collection.mutable.ArrayBuffer /** * 日期格式化工具类 * Created by ChengLong on 2016-11-24. */ object DateFormatUtils extends Logging { lazy val yyyyMMdd = "yyyyMMdd" lazy val yyyy_MM_dd = "yyyy-MM-dd" lazy val yyyyMMddHH = "yyyyMMddHH" lazy val yyyy_MM_ddHHmmss = "yyyy-MM-dd HH:mm:ss" lazy val TRUNCATE_MIN = "yyyy-MM-dd HH:mm:00" private val timeZoneShangHai = "Asia/Shanghai" lazy val HOUR = "hour" lazy val DAY = "day" lazy val WEEK = "week" lazy val MONTH = "month" lazy val YEAR = "year" lazy val MINUTE = "minute" lazy val SECOND = "second" lazy val enumSet = Set(HOUR, DAY, WEEK, MONTH, YEAR, MINUTE, SECOND) /** * 将日期格式化为 yyyy-MM-dd HH:mm:ss */ def getTimeFormat(): SimpleDateFormat = { val timeFormat: SimpleDateFormat = new SimpleDateFormat(DateFormatUtils.yyyy_MM_ddHHmmss) timeFormat.setTimeZone(TimeZone.getTimeZone(timeZoneShangHai)) timeFormat } /** * 给定yyyy-MM-dd HH:mm:ss 格式数据,返回yyyy-MM-dd */ def getDateFromDateTimeStr(dateTime: String) = { if (StringUtils.isNotBlank(dateTime) && dateTime.length() > 10) { dateTime.substring(0, 10) } else { dateTime } } /** * 给定yyyy-MM-dd HH:mm:ss 格式数据,返回yyyyMMdd格式的时间分区 */ def getPartitionDate(dateTime: String): String = { this.getDateFromDateTimeStr(dateTime).replace("-", "") } /** * 将日期格式化为 yyyy-MM-dd */ def getDateFormat(): SimpleDateFormat = { this.getSchemaFormat() } /** * 将日期格式化为 yyyy-MM-dd */ def getSchemaFormat(schema: String = DateFormatUtils.yyyy_MM_dd): SimpleDateFormat = { val dateFormat: SimpleDateFormat = new SimpleDateFormat(schema) dateFormat.setTimeZone(TimeZone.getTimeZone(timeZoneShangHai)) dateFormat } /** * 格式化Date为yyyy-MM-dd格式的字符串 */ def formatDate(date: Date): String = { this.getDateFormat().format(date) } /** * 将日期格式化为 yyyy-MM-dd hh:mm:ss 格式的字符串 */ def formatDateTime(dateTime: Date): String = { if (dateTime != null) this.getTimeFormat().format(dateTime) else "" } /** * 将指定时间转为指定schema的格式 * * @param dateTime * 指定时间 * @return */ def formatBySchema(dateTime: Date, schema: String): String = { if (dateTime != null) this.getSchemaFormat(schema).format(dateTime) else "" } /** * 将字符串格式化为yyyy-MM-dd的日期 */ def formatDate(date: String): Date = { this.getDateFormat().parse(date) } /** * 将字符串格式化为yyyy-MM-dd hh:mm:ss的日期 */ def formatDateTime(dateTime: String): Date = { this.getTimeFormat().parse(dateTime) } /** * 将当期系统时间格式化为yyyy-MM-dd 并返回字符串 */ def formatCurrentDate(): String = { this.formatDate(new Date) } /** * 将当期系统时间格式化为yyyy-MM-dd hh:mm:ss并返回字符串 */ 
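  /*
   * A minimal usage sketch for DateFormatUtils (added for illustration, not part of the original
   * file); only members defined in this file are referenced and the sample values are assumptions:
   *
   *   import com.zto.fire.common.util.DateFormatUtils
   *
   *   val now     = DateFormatUtils.formatCurrentDateTime()               // e.g. "2021-03-12 13:35:00"
   *   val dt      = DateFormatUtils.getPartitionDate(now)                 // "20210312", hive partition style
   *   val nextDay = DateFormatUtils.addDays(now, 1)                       // +1 day, yyyy-MM-dd HH:mm:ss
   *   val hourKey = DateFormatUtils.dateSchemaFormat(now,
   *     DateFormatUtils.yyyy_MM_ddHHmmss, DateFormatUtils.yyyyMMddHH)     // e.g. "2021031213"
   *
   * All formatters above are pinned to the Asia/Shanghai timezone configured in this class.
   */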
def formatCurrentDateTime(): String = { this.formatDateTime(new Date) } /** * 转换当前时间为指定的时间格式 * * @param schema * 指定的schema */ def formatCurrentBySchema(schema: String): String = { this.formatBySchema(new Date, schema) } /** * 将指定的unix元年时间转为yyyy-MM-dd 的字符串 */ def formatUnixDate(date: Long): String = { this.formatDate(new Date(date)) } /** * 将指定的unix元年时间转为yyyy-MM-dd hh:mm:ss 的字符串 */ def formatUnixDateTime(dateTime: Long): String = { this.formatDateTime(new Date(dateTime)) } /** * 对日期进行格式转换 */ def dateSchemaFormat(dateTimeStr: String, srcSchema: String, destSchema: String): String = { if (StringUtils.isBlank(dateTimeStr)) { return dateTimeStr } val timeFormat: SimpleDateFormat = new SimpleDateFormat(srcSchema) timeFormat.setTimeZone(TimeZone.getTimeZone(timeZoneShangHai)) val datetime = timeFormat.parse(dateTimeStr) timeFormat.applyPattern(destSchema) timeFormat.format(datetime) } /** * 对日期进行格式转换 */ def dateSchemaFormat(dateTime: Date, srcSchema: String, destSchema: String): Date = { val timeFormat: SimpleDateFormat = new SimpleDateFormat(srcSchema) timeFormat.setTimeZone(TimeZone.getTimeZone(timeZoneShangHai)) val dateTimeStr = timeFormat.format(dateTime) timeFormat.applyPattern(destSchema) timeFormat.parse(dateTimeStr) } /** * 判断两个日期是否为同一天 */ def isSameDay(day1: String, day2: String): Boolean = { if (StringUtils.isNotBlank(day1) && StringUtils.isNotBlank(day2)) { val format = this.getTimeFormat() DateUtils.isSameDay(format.parse(day1), format.parse(day2)) } else { false } } /** * 判断两个日期是否为同一天 */ def isSameDay(day1: Date, day2: Date): Boolean = { DateUtils.isSameDay(day1, day2) } /** * 用于判断给定的时间是否和系统时间处于同一天 */ def isSameDay(date: String): Boolean = { try { DateUtils.isSameDay(new Date(), this.getTimeFormat().parse(date)) } catch { case e: Exception => { logger.error("isSameDay判断失败", e) false } } } /** * 判断两个日期是否为同一小时(前提是同一天) */ def isSameHour(day1: String, day2: String): Boolean = { if (StringUtils.isNotBlank(day1) && StringUtils.isNotBlank(day2)) { val format = this.getTimeFormat() val d1 = format.parse(day1) val d2 = format.parse(day2) if (this.isSameDay(d1, d2)) { d1.getHours == d2.getHours } else { false } } else { false } } /** * 判断两个日期是否为同一小时(前提是同一天) */ def isSameHour(day1: Date, day2: Date): Boolean = { if (this.isSameDay(day1, day2)) { day1.getHours == day2.getHours } else { false } } /** * 判断两个日期是否为同一星期(必须是同年同月) */ def isSameWeek(day1: Date, day2: Date): Boolean = { if (this.isSameYear(day1, day2) && this.isSameMonth(day1, day2)) { val cal = Calendar.getInstance() cal.setTimeZone(TimeZone.getTimeZone(timeZoneShangHai)) cal.setTime(day1) val week1 = cal.get(Calendar.DAY_OF_WEEK_IN_MONTH) cal.setTime(day2) week1 == cal.get(Calendar.DAY_OF_WEEK_IN_MONTH) } else { false } } /** * 判断两个日期是否为同一星期(必须是同年同月) */ def isSameWeek(day1: String, day2: String): Boolean = { if (StringUtils.isNotBlank(day1) && StringUtils.isNotBlank(day2)) { val format = this.getTimeFormat() val d1 = format.parse(day1) val d2 = format.parse(day2) this.isSameWeek(d1, d2) } else { false } } /** * 判断两个日期是否为同一月份 */ def isSameMonth(day1: Date, day2: Date): Boolean = { day1.getMonth == day2.getMonth } /** * 判断两个日期是否为同一月份 */ def isSameMonth(day1: String, day2: String): Boolean = { val format = this.getTimeFormat() val d1 = format.parse(day1) val d2 = format.parse(day2) this.isSameMonth(d1, d2) } /** * 判断两个日期是否为同一年 */ def isSameYear(day1: Date, day2: Date): Boolean = { day1.getYear == day2.getYear } /** * 判断两个日期是否为同一年 */ def isSameYear(day1: String, day2: String): Boolean = { val format = this.getTimeFormat() val d1 = 
format.parse(day1) val d2 = format.parse(day2) this.isSameYear(d1, d2) } /** * day1是否大于day2 */ def isBig(day1: String, day2: String): Boolean = { if (StringUtils.isNotBlank(day1) && StringUtils.isNotBlank(day2)) { DateFormatUtils.formatDateTime(day1).after(DateFormatUtils.formatDateTime(day2)) } else if (StringUtils.isNotBlank(day1) && StringUtils.isBlank(day2)) { true } else if (StringUtils.isBlank(day1) && StringUtils.isNotBlank(day2)) { false } else { true } } /** * day1是否小于day2 */ def isSmall(day1: String, day2: String): Boolean = { !this.isBig(day1, day2) } /** * day 是否介于day1与day2之间 */ def isBetween(day: String, day1: String, day2: String) = { this.isSmall(day, day2) && this.isBig(day, day1) } /** * 指定时间字段,对日期进行加减 * * @param field * 'year'、'month'、'day'、'hour'、'minute'、'second' * @param dateTimeStr * 格式:yyyy-MM-dd hh:mm:ss * @param count * 正负数 * @return * 计算后的日期 */ def addTimer(field: String, dateTimeStr: String, count: Int): String = { if (this.YEAR.equalsIgnoreCase(field)) { this.addYears(dateTimeStr, count) } else if (this.MONTH.equalsIgnoreCase(field)) { this.addMons(dateTimeStr, count) } else if (this.DAY.equalsIgnoreCase(field)) { this.addDays(dateTimeStr, count) } else if (this.HOUR.equalsIgnoreCase(field)) { this.addHours(dateTimeStr, count) } else if (this.MINUTE.equalsIgnoreCase(field)) { this.addMins(dateTimeStr, count) } else if (this.SECOND.equalsIgnoreCase(field)) { this.addSecs(dateTimeStr, count) } else { "" } } /** * 对指定的时间字段进行年度加减 */ def addYears(dateTimeStr: String, years: Int): String = { if (StringUtils.isNotBlank(dateTimeStr) && !"null".equals(dateTimeStr) && !"NULL".equals(dateTimeStr)) { val datetime = DateFormatUtils.formatDateTime(dateTimeStr) DateFormatUtils.formatDateTime(DateUtils.addYears(datetime, years)) } else { "" } } /** * 对指定的时间字段进行年度加减 */ def addYears(dateTime: Date, years: Int): String = { if (dateTime != null) { DateFormatUtils.formatDateTime(DateUtils.addYears(dateTime, years)) } else { "" } } /** * 对指定的时间字段进行月份加减 */ def addMons(dateTimeStr: String, mons: Int): String = { if (StringUtils.isNotBlank(dateTimeStr) && !"null".equals(dateTimeStr) && !"NULL".equals(dateTimeStr)) { val datetime = DateFormatUtils.formatDateTime(dateTimeStr) DateFormatUtils.formatDateTime(DateUtils.addMonths(datetime, mons)) } else { "" } } /** * 对指定的时间字段进行月份加减 */ def addMons(dateTime: Date, mons: Int): String = { if (dateTime != null) { DateFormatUtils.formatDateTime(DateUtils.addMonths(dateTime, mons)) } else { "" } } /** * 对指定日期增加天 */ def addDays(dateTimeStr: String, days: Int): String = { if (StringUtils.isNotBlank(dateTimeStr) && !"null".equals(dateTimeStr) && !"NULL".equals(dateTimeStr)) { val datetime = DateFormatUtils.formatDateTime(dateTimeStr) DateFormatUtils.formatDateTime(DateUtils.addDays(datetime, days)) } else { "" } } /** * 对指定日期增加天 */ def addDays(dateTime: Date, days: Int): String = { if (dateTime != null) { DateFormatUtils.formatDateTime(DateUtils.addDays(dateTime, days)) } else { "" } } /** * 对指定日期增加天,并以指定的格式返回 */ def addPartitionDays(dateTime: Date, days: Int, schema: String = "yyyyMMdd"): String = { if (dateTime != null) { DateFormatUtils.formatBySchema(DateUtils.addDays(dateTime, days), schema) } else { "" } } /** * 对指定的时间字段进行天加减 */ def addWeeks(dateTimeStr: String, weeks: Int): String = { if (StringUtils.isNotBlank(dateTimeStr) && !"null".equals(dateTimeStr) && !"NULL".equals(dateTimeStr)) { val datetime = DateFormatUtils.formatDateTime(dateTimeStr) DateFormatUtils.formatDateTime(DateUtils.addWeeks(datetime, weeks)) } else { "" } } /** * 
对指定的时间字段进行周加减
   */
  def addWeeks(dateTime: Date, weeks: Int): String = {
    if (dateTime != null) {
      DateFormatUtils.formatDateTime(DateUtils.addWeeks(dateTime, weeks))
    } else {
      ""
    }
  }

  /**
   * 对指定的时间字段进行小时加减
   */
  def addHours(dateTimeStr: String, hours: Int): String = {
    if (StringUtils.isNotBlank(dateTimeStr) && !"null".equals(dateTimeStr) && !"NULL".equals(dateTimeStr)) {
      val datetime = DateFormatUtils.formatDateTime(dateTimeStr)
      DateFormatUtils.formatDateTime(DateUtils.addHours(datetime, hours))
    } else {
      ""
    }
  }

  /**
   * 对指定的时间字段进行小时加减
   */
  def addHours(dateTime: Date, hours: Int): String = {
    if (dateTime != null) {
      DateFormatUtils.formatDateTime(DateUtils.addHours(dateTime, hours))
    } else {
      ""
    }
  }

  /**
   * 对指定的时间字段进行分钟加减
   */
  def addMins(dateTimeStr: String, minutes: Int): String = {
    if (StringUtils.isNotBlank(dateTimeStr) && !"null".equals(dateTimeStr) && !"NULL".equals(dateTimeStr)) {
      val datetime = DateFormatUtils.formatDateTime(dateTimeStr)
      DateFormatUtils.formatDateTime(DateUtils.addMinutes(datetime, minutes))
    } else {
      ""
    }
  }

  /**
   * 对指定的时间字段进行分钟加减
   */
  def addMins(dateTime: Date, minutes: Int): String = {
    if (dateTime != null) {
      DateFormatUtils.formatDateTime(DateUtils.addMinutes(dateTime, minutes))
    } else {
      ""
    }
  }

  /**
   * 对指定的时间字段进行秒钟加减
   */
  def addSecs(dateTimeStr: String, seconds: Int): String = {
    if (StringUtils.isNotBlank(dateTimeStr) && !"null".equals(dateTimeStr) && !"NULL".equals(dateTimeStr)) {
      val datetime = DateFormatUtils.formatDateTime(dateTimeStr)
      DateFormatUtils.formatDateTime(DateUtils.addSeconds(datetime, seconds))
    } else {
      ""
    }
  }

  /**
   * 对指定的时间字段进行秒钟加减
   */
  def addSecs(dateTime: Date, seconds: Int): String = {
    if (dateTime != null) {
      DateFormatUtils.formatDateTime(DateUtils.addSeconds(dateTime, seconds))
    } else {
      ""
    }
  }

  /**
   * 获取day1到day2之间的所有日期
   *
   * @param prefix
   * 指定拼接前缀
   */
  def getBetweenDate(prefix: String, day1: String, day2: String): Array[String] = {
    val dates = ArrayBuffer[String]()
    var nextDay = this.addDays(day1, 1)
    if (this.isBetween(nextDay, day1, day2)) {
      dates += s"$prefix >= to_date('$day1','yyyy-mm-dd hh24:mi:ss') and $prefix < to_date('$nextDay','yyyy-mm-dd hh24:mi:ss')"
    }
    while (this.isBetween(nextDay, day1, day2)) {
      var tmpDay = ""
      tmpDay = this.addDays(nextDay, 1)
      dates += s"$prefix >= to_date('$nextDay','yyyy-mm-dd hh24:mi:ss') and $prefix < to_date('$tmpDay','yyyy-mm-dd hh24:mi:ss')"
      nextDay = tmpDay
    }
    dates.toArray
  }

  /**
   * 计算date1与date2之间相差的小时数
   * @return
   * 相差的小时数
   */
  def betweenHours(date1: Date, date2: Date): Double = {
    (date1.getTime - date2.getTime) / 3600000.0
  }

  /**
   * 将yyyy-MM-dd hh:mm:ss类型日期truncate为月初零点
   */
  def truncateMonth(dateTime: Date): String = {
    val cal = Calendar.getInstance()
    if (dateTime != null) cal.setTime(dateTime)
    val year = cal.get(Calendar.YEAR)
    val month = cal.get(Calendar.MONTH) + 1
    if (month < 10) year + "-0" + month + "-01 00:00:00" else year + "-" + month + "-01 00:00:00"
  }

  /**
   * 取年月日
   */
  def getyyyyMMdd(dataTime: String): String = {
    if (StringUtils.isNotBlank(dataTime) && dataTime.length >= 10) {
      dataTime.substring(0, 10)
    } else {
      dataTime
    }
  }

  /**
   * 取年月
   */
  def getyyyyMM(dataTime: String): String = {
    if (StringUtils.isNotBlank(dataTime) && dataTime.length >= 7) {
      dataTime.substring(0, 7)
    } else {
      dataTime
    }
  }

  /**
   * 取年
   */
  def getyyyy(dataTime: String): String = {
    if (StringUtils.isNotBlank(dataTime) && dataTime.length >= 4) {
      dataTime.substring(0, 4)
    } else {
      dataTime
    }
  }

  /**
   * 获取指定日期的月初时间,如为空则返回系统当前时间对应的月初
   */
  def truncateMonthStr(dateTime: String): String = {
    var dateTimeStr = dateTime
    if (StringUtils.isBlank(dateTimeStr)) {
      dateTimeStr = this.getTimeFormat().format(new Date)
    }
    this.truncateMonth(this.formatDate(dateTimeStr))
  }

  /**
   * 根据指定的时间和格式,将时间格式化为hive分区格式
   */
  def getPartitionTime(dateTime: String = this.formatCurrentDateTime(), schema: String = DateFormatUtils.yyyyMMdd): String = {
    this.dateSchemaFormat(dateTime, DateFormatUtils.yyyy_MM_ddHHmmss, schema)
  }

  /**
   * 将当前系统时间格式化为指定的格式作为分区
   */
  def getCurrentPartitionTime(schema: String = DateFormatUtils.yyyyMMdd): String = {
    getPartitionTime(this.formatCurrentDateTime(), schema)
  }

  /**
   * 获取两个时间间隔的毫秒数
   */
  def interval(before: Date, after: Date): Long = {
    after.getTime - before.getTime
  }

  /**
   * 获取两个时间间隔的毫秒数
   */
  def interval(before: String, after: String): Long = {
    this.formatDateTime(after).getTime - this.formatDateTime(before).getTime
  }

  /**
   * 将yyyy-MM-dd hh:mm:ss类型日期truncate为整点分钟
   */
  def truncateMinute(dateTime: String): String = {
    val date = this.formatDateTime(dateTime)
    val prefix = this.dateSchemaFormat(dateTime, DateFormatUtils.yyyy_MM_ddHHmmss, "yyyy-MM-dd HH")
    val minute = date.getMinutes
    if (minute >= 0 && minute < 10) {
      s"$prefix:00"
    } else if (minute >= 10 && minute < 20) {
      s"$prefix:10"
    } else if (minute >= 20 && minute < 30) {
      s"$prefix:20"
    } else if (minute >= 30 && minute < 40) {
      s"$prefix:30"
    } else if (minute >= 40 && minute < 50) {
      s"$prefix:40"
    } else {
      s"$prefix:50"
    }
  }

  /**
   * 将yyyy-MM-dd hh:mm:ss类型日期truncate为整点分钟
   */
  def truncateMinute(dateTime: Date): String = {
    this.truncateMinute(this.formatDateTime(dateTime))
  }

  /**
   * 获取整点小时
   */
  def truncateHour(dateStr: String): String = {
    this.dateSchemaFormat(dateStr, DateFormatUtils.yyyy_MM_ddHHmmss, DateFormatUtils.yyyyMMddHH)
  }

  /**
   * 截取指定时间指定的位数
   *
   * @param date
   * 日期
   * @param cron
   * 切分的范围
   * @param replace
   * 是否替换掉日期字符串中的特殊字符
   * @return
   */
  def truncate(date: String, cron: String = this.DAY, replace: Boolean = true): String = {
    if (StringUtils.isBlank(date) || StringUtils.isBlank(cron) || date.length != 19) {
      throw new IllegalArgumentException("日期不能为空,格式为yyyy-MM-dd HH:mm:ss")
    }
    if (!this.enumSet.contains(cron)) {
      throw new IllegalArgumentException("where参数必须是hour/day/week/month/year中的一个")
    }
    val index: Int = if (this.HOUR.equals(cron)) {
      13
    } else if (this.DAY.equals(cron)) {
      10
    } else if (this.MONTH.equals(cron)) {
      7
    } else if (this.MINUTE.equals(cron)) {
      15
    } else {
      4
    }
    if (replace) date.substring(0, index).replace("-", "").replace(":", "").replace(" ", "") else date.substring(0, index)
  }

  /**
   * 截取指定时间指定的位数
   *
   * @param date
   * 日期
   * @param cron
   * 切分的范围
   * @param replace
   * 是否替换掉日期字符串中的特殊字符
   * @return
   */
  def truncate(date: Date, cron: String, replace: Boolean): String = {
    this.truncate(this.formatDateTime(date), cron, replace)
  }

  /**
   * 截取系统时间指定的位数
   *
   * @param cron
   * 切分的范围
   * @param replace
   * 是否替换掉日期字符串中的特殊字符
   * @return
   */
  def truncate(cron: String, replace: Boolean): String = {
    this.truncate(this.formatCurrentDateTime(), cron, replace)
  }

  /**
   * 判断给定的时间的秒位的个位是否为0秒,如00/10/20/30/40/50
   */
  def isSecondDivisibleZero(date: Date = new Date): Boolean = {
    val cal = Calendar.getInstance()
    cal.setTimeZone(TimeZone.getTimeZone(timeZoneShangHai))
    cal.setTime(date)
    cal.get(Calendar.SECOND) % 10 == 0
  }

  /**
   * 判断给定的时间的秒位的个位是否为0秒,如00/10/20/30/40/50
   */
  def isSecondDivisibleZero(dateTime: String): Boolean = {
    this.isSecondDivisibleZero(this.formatDateTime(dateTime))
  }

  /**
   * 判断给定的时间的秒位是否为00秒
   */
  def isZeroSecond(date: Date = new Date): Boolean = {
    val cal = Calendar.getInstance()
    cal.setTimeZone(TimeZone.getTimeZone(timeZoneShangHai))
    cal.setTime(date)
    cal.get(Calendar.SECOND) == 0
  }

  /**
   * 判断给定的时间的秒位是否为00秒
   */
  def isZeroSecond(dateTime: String): Boolean = {
this.isZeroSecond(this.formatDateTime(dateTime)) } /** * 判断给定的时间的分钟位是否为00分 */ def isZeroMinute(date: Date = new Date): Boolean = { if (this.isZeroSecond(date)) { val cal = Calendar.getInstance() cal.setTimeZone(TimeZone.getTimeZone(timeZoneShangHai)) cal.setTime(date) cal.get(Calendar.MINUTE) == 0 } else { false } } /** * 判断给定的时间的分钟位是否为00分 */ def isZeroMinute(dateTime: String): Boolean = { this.isZeroMinute(this.formatDateTime(dateTime)) } /** * 判断给定的时间的小时位是否为00时 */ def isZeroHour(date: Date = new Date): Boolean = { if (this.isZeroMinute(date)) { val cal = Calendar.getInstance() cal.setTimeZone(TimeZone.getTimeZone(timeZoneShangHai)) cal.setTime(date) cal.get(Calendar.HOUR_OF_DAY) == 0 } else { false } } /** * 判断给定的时间的小时位是否为00时 */ def isZeroHour(dateTime: String): Boolean = { this.isZeroHour(this.formatDateTime(dateTime)) } /** * 获取系统当前时间,精确到秒 */ def currentTime: Long = { System.currentTimeMillis() / 1000 } /** * 计算运行时长 */ def runTime(startTime: Long): String = { val currentTime = this.currentTime val apartTime = currentTime - startTime val hours = apartTime / 3600 val hoursStr = if (hours < 10) s"0${hours}" else s"${hours}" val minutes = apartTime / 60 - hours * 60 val minutesStr = if (minutes < 10) s"0${minutes}" else s"${minutes}" val seconds = apartTime - minutes * 60 - hours * 60 * 60 val secondsStr = if (seconds < 10) s"0${seconds}" else s"${seconds}" s"${hoursStr}时 ${minutesStr}分 ${secondsStr}秒" } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/ExceptionBus.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
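 */

/*
 * A hedged usage sketch for the ExceptionBus object defined below (added for illustration,
 * not part of the original file); only members declared in this file are referenced and the
 * sql value is an assumption:
 *
 *   import com.zto.fire.common.util.ExceptionBus
 *
 *   try {
 *     doSomethingRisky()                          // hypothetical user code
 *   } catch {
 *     case t: Throwable =>
 *       ExceptionBus.post(t, sql = "select 1")    // queue the exception for async reporting
 *       println(ExceptionBus.stackTrace(t))       // readable stack trace string
 *   }
 *
 * Queued exceptions are drained every few seconds and forwarded to the MQ configured via
 * FireFrameworkConf when exceptionTraceEnable is true.
 */

/*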
*/ package com.zto.fire.common.util import com.google.common.collect.EvictingQueue import com.zto.fire.common.anno.Internal import com.zto.fire.common.bean.analysis.ExceptionMsg import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.predef._ import org.slf4j.Logger import java.util.concurrent.TimeUnit import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger, AtomicLong} /** * Fire框架异常总线,用于收集各引擎执行task过程中发生的异常信息 * * @author ChengLong * @since 1.1.2 * @create 2020-11-16 09:33 */ object ExceptionBus extends Logging { // 用于保存收集而来的异常对象 @transient private[this] lazy val queue = EvictingQueue.create[(String, Throwable, String)](FireFrameworkConf.exceptionBusSize) // 队列大小,对比queue.size有性能优势 private[fire] lazy val queueSize = new AtomicInteger(0) // 异常总数计数器 private[fire] lazy val exceptionCount = new AtomicLong(0) private[this] lazy val isStarted = new AtomicBoolean(false) this.sendToMQ /** * 周期性将异常堆栈信息发送到指定的MQ中,用于平台异常诊断 */ @Internal def sendToMQ: Unit = { if (!FireFrameworkConf.exceptionTraceEnable) return // 启动异步线程,定时将异常信息发送到指定的消息队列 if (this.isStarted.compareAndSet(false, true)) { ThreadUtils.scheduleAtFixedRate({ this.postException }, 0, 3, TimeUnit.SECONDS) // 注册回调,在jvm退出前将所有异常发送到mq中 MQProducer.addHook(ShutdownHookManager.HEIGHT_PRIORITY)(postException) } } /** * 将异常信息投递到MQ中 */ private[this] def postException: Unit = { val mqUrl = FireFrameworkConf.exceptionTraceMQ val mqTopic = FireFrameworkConf.exceptionTraceMQTopic if (isEmpty(mqUrl, mqTopic)) return val msg = this.getAndClear if (msg._1.nonEmpty) { msg._1.foreach(t => { MQProducer.send(mqUrl, mqTopic, new ExceptionMsg(t._2, t._3).toString) }) logger.debug(s"异常诊断:本轮发送异常共计${msg._1.size}个.") } } /** * 向异常总线中投递异常对象 */ def post(t: Throwable, sql: String = ""): Boolean = this.synchronized { exceptionCount.incrementAndGet() this.queue.offer((DateFormatUtils.formatCurrentDateTime(), t, sql)) } /** * 获取并清空queue * * @return 异常集合 */ @Internal private[fire] def getAndClear: (List[(String, Throwable, String)], Long) = this.synchronized { val list = this.queue.toList this.queue.clear() queueSize.set(0) this.logger.debug(s"成功收集异常总线中的异常对象共计:${list.size}条,异常总线将会被清空.") (list, this.exceptionCount.get()) } /** * 工具方法,用于打印异常信息 */ @Internal private[fire] def offAndLogError(logger: Logger, msg: String, t: Throwable, sql: String = ""): Unit = { this.post(t, sql) if (noEmpty(msg)) { if (logger != null) logger.error(msg, t) else t.printStackTrace() } } /** * 获取Throwable的堆栈信息 */ def stackTrace(t: Throwable): String = { if (t == null) return "" val stackTraceInfo = new StringBuilder() stackTraceInfo.append(t.toString + "\n") t.getStackTrace.foreach(trace => stackTraceInfo.append("\tat " + trace + "\n")) stackTraceInfo.toString } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/FireFunctions.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util import com.zto.fire.common.conf.FirePS1Conf import com.zto.fire.common.util.UnitFormatUtils.{TimeUnitEnum, readable} import org.apache.commons.lang3.StringUtils import org.slf4j.Logger import scala.util.Try /** * 常用的函数库 * * @author ChengLong * @since 1.0.0 * @create 2020-12-16 15:45 */ trait FireFunctions extends Serializable with Logging { private[this] lazy val tryLog = "" private[this] lazy val catchLog = "执行try的过程中发生异常" private[this] lazy val finallyCatchLog = "执行finally过程中发生异常" /** * 重试指定的函数fn retryNum次 * 当fn执行失败时,会根据设置的重试次数自动重试retryNum次 * 每次重试间隔等待duration(毫秒) * * @param retryNum * 指定重试的次数 * @param duration * 重试的间隔时间(ms) * @param fun * 重试的函数或方法 * @tparam T * fn执行后返回的数据类型 * @return * 返回fn执行结果 */ def retry[T](retryNum: Long = 3, duration: Long = 3000)(fun: => T): T = { var count = 1L def redo[T](retryNum: Long, duration: Long)(fun: => T): T = { Try { fun } match { case util.Success(x) => x case _ if retryNum > 1 => { Thread.sleep(duration) count += 1 logger.info(s"${FirePS1Conf.RED}第${count}次执行. 时间:${DateFormatUtils.formatCurrentDateTime()}. 间隔:${duration}.${FirePS1Conf.DEFAULT}") redo(retryNum - 1, duration)(fun) } case util.Failure(e) => throw e } } redo(retryNum, duration)(fun) } /** * 尝试执行block中的逻辑,如果出现异常,则记录日志 * * @param block * try的具体逻辑 * @param logger * 日志记录器 * @param catchLog * 日志内容 * @param hook * 是否将捕获到的异常信息发送到消息队列 */ def tryWithLog(block: => Unit)(logger: Logger = this.logger, tryLog: String = tryLog, catchLog: String = catchLog, isThrow: Boolean = false, hook: Boolean = true): Unit = { try { elapsed(tryLog, logger)(block) } catch { case t: Throwable => { if (hook) ExceptionBus.offAndLogError(logger, catchLog, t) if (isThrow) throw t } } } /** * 尝试执行block中的逻辑,如果出现异常,则记录日志,并将执行结果返回 * * @param block * try的具体逻辑 * @param logger * 日志记录器 * @param catchLog * 日志内容 * @param hook * 是否将捕获到的异常信息发送到消息队列 */ def tryWithReturn[T](block: => T)(logger: Logger = this.logger, tryLog: String = tryLog, catchLog: String = catchLog, hook: Boolean = true): T = { try { elapsed[T](tryLog, logger)(block) } catch { case t: Throwable => { if (hook) ExceptionBus.offAndLogError(logger, catchLog, t) throw t } } } /** * 执行用户指定的try/catch/finally逻辑 * * @param block * try 代码块 * @param finallyBlock * finally 代码块 * @param logger * 日志记录器 * @param catchLog * 当执行try过程中发生异常时打印的日志内容 * @param finallyCatchLog * 当执行finally代码块过程中发生异常时打印的日志内容 * @param hook * 是否将捕获到的异常信息发送到消息队列 */ def tryFinallyWithReturn[T](block: => T)(finallyBlock: => Unit)(logger: Logger = this.logger, tryLog: String = tryLog, catchLog: String = catchLog, finallyCatchLog: String = finallyCatchLog, hook: Boolean = true): T = { try { elapsed[T](tryLog, logger)(block) } catch { case t: Throwable => if (hook) ExceptionBus.offAndLogError(logger, catchLog, t) throw t } finally { try { finallyBlock } catch { case t: Throwable => if (hook) ExceptionBus.offAndLogError(logger, catchLog, t) throw t } } } /** * 执行用户指定的try/catch/finally逻辑 * * @param block * try 代码块 * @param finallyBlock * finally 代码块 * @param logger * 日志记录器 * @param catchLog * 当执行try过程中发生异常时打印的日志内容 * @param finallyCatchLog * 当执行finally代码块过程中发生异常时打印的日志内容 * @param hook 
* 是否将捕获到的异常信息发送到消息队列 */ def tryFinally(block: => Unit)(finallyBlock: => Unit)(logger: Logger = this.logger, tryLog: String = tryLog, catchLog: String = catchLog, finallyCatchLog: String = finallyCatchLog, hook: Boolean = true): Unit = { try { elapsed[Unit](tryLog, logger)(block) } catch { case t: Throwable => if (hook) ExceptionBus.offAndLogError(logger, catchLog, t) } finally { try { finallyBlock } catch { case t: Throwable => if (hook) ExceptionBus.offAndLogError(logger, catchLog, t) } } } /** * 获取当前系统时间(ms) */ def currentTime: Long = System.currentTimeMillis /** * 以人类可读的方式计算耗时(ms) * * @param beginTime * 开始时间 * @return * 耗时 */ def elapsed(beginTime: Long): String = readable(currentTime - beginTime, TimeUnitEnum.MS) /** * 用于统计指定代码块执行的耗时时间 * * @param msg * 用于描述当前代码块的用户 * @param logger * 日志记录器 * @param threshold * 执行代码块耗时超过给定的阈值时才记录日志 * @param block * try的具体逻辑 */ def elapsed[T](msg: String, logger: Logger = this.logger, threshold: Long = 0)(block: => T): T = { val startTime = this.currentTime val retVal = block if (StringUtils.isNotBlank(msg) && (System.currentTimeMillis() - startTime) >= threshold) logger.info(s"${msg}, Elapsed:${elapsed(startTime)}") retVal } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/FireUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
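 */

/*
 * FireUtils below is private[fire], so it is only reachable from framework-internal code;
 * a hedged sketch of that internal use (added for illustration, not part of the original file):
 *
 *   // somewhere inside the com.zto.fire package
 *   if (FireUtils.isSparkEngine) {
 *     logger.info(s"engine=${FireUtils.engine} version=${FireUtils.engineVersion} " +
 *       s"appId=${FireUtils.applicationId} uptime=${FireUtils.uptime}ms")
 *   }
 *
 * engineVersion/applicationId/deployMode are resolved reflectively through SparkUtils or
 * FlinkUtils, so they only return meaningful values when the matching engine module is on
 * the classpath.
 */

/*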
*/ package com.zto.fire.common.util import com.zto.fire.common.conf.{FireFrameworkConf, FirePS1Conf} import com.zto.fire.common.enu.JobType import com.zto.fire.predef._ /** * fire框架通用的工具方法 * 注:该工具类中不可包含Spark或Flink的依赖 * * @author ChengLong * @since 1.0.0 * @create: 2020-05-17 10:17 */ private[fire] object FireUtils extends Serializable with Logging { private[fire] var isSplash = false private[fire] var _jobType: JobType = JobType.UNDEFINED private[fire] val _launchTime = System.currentTimeMillis() private[this] lazy val sparkUtils = "com.zto.fire.spark.util.SparkUtils" private[this] lazy val flinkUtils = "com.zto.fire.flink.util.FlinkUtils" /** * 获取任务启动时间 */ def launchTime: Long = this._launchTime /** * 任务运行时间 */ def uptime: Long = System.currentTimeMillis() - this.launchTime /** * 判断是否为spark引擎 */ def isSparkEngine: Boolean = "spark".equals(this.engine) /** * 判断是否为flink引擎 */ def isFlinkEngine: Boolean = "flink".equals(this.engine) /** * 获取当前实时任务所使用的计算引擎 * @return * spark / flink */ def engine: String = PropUtils.engine /** * 当前任务的引擎类型 */ def jobType: JobType = this._jobType /** * 获取fire版本号 */ def fireVersion: String = FireFrameworkConf.fireVersion /** * 获取当前执行引擎的版本号 * @return * spark-version / flink-version */ def engineVersion: String = invokeEngineUtils("getVersion") /** * 获取当前执行引擎运行时的appId */ def applicationId: String = invokeEngineUtils("getApplicationId") /** * 任务发布类型:yarn-client/yarn-cluster/run-application */ def deployMode: String = invokeEngineUtils("deployMode") /** * 反射调用不同引擎上层的工具方法 * @param methodName * spark或flink工具类的方法名 * @return */ private[this] def invokeEngineUtils(methodName: JString): String = { tryWithReturn { if (this.isSparkEngine) { val getVersionMethod = ReflectionUtils.getMethodByName(sparkUtils, methodName) getVersionMethod.invoke(null).toString } else { val getVersionMethod = ReflectionUtils.getMethodByName(flinkUtils, methodName) getVersionMethod.invoke(null).toString } } (this.logger, catchLog = s"反射调用工具类方法[$methodName]失败") } /** * 当前任务实例的主类名:packageName+className */ def mainClass: String = FireFrameworkConf.driverClassName /** * 用于在fire框架启动时展示信息 */ private[fire] def splash: Unit = { if (!isSplash) { val engineVersion = if (this.isSparkEngine) s"spark version:${this.engineVersion}" else s"flink version:${this.engineVersion}" val info = """ | ___ ___ ___ | /\ \ ___ /\ \ /\ \ | /::\ \ /\ \ /::\ \ /::\ \ | /:/\:\ \ \:\ \ /:/\:\ \ /:/\:\ \ | /::\~\:\ \ /::\__\ /::\~\:\ \ /::\~\:\ \ | /:/\:\ \:\__\ __/:/\/__/ /:/\:\ \:\__\ /:/\:\ \:\__\ | \/__\:\ \/__/ /\/:/ / \/_|::\/:/ / \:\~\:\ \/__/ | \:\__\ \::/__/ |:|::/ / \:\ \:\__\ | \/__/ \:\__\ |:|\/__/ \:\ \/__/ | \/__/ |:| | \:\__\ | \|__| \/__/ version | |""".stripMargin.replace("version", s"fire version:${FirePS1Conf.PINK + this.fireVersion + FirePS1Conf.GREEN} $engineVersion") this.logger.warn(FirePS1Conf.GREEN + info + FirePS1Conf.DEFAULT) this.isSplash = true } } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/JSONUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util import com.zto.fire.predef._ import org.apache.commons.lang3.StringUtils import org.apache.htrace.fasterxml.jackson.annotation.JsonAutoDetect.Visibility import org.apache.htrace.fasterxml.jackson.annotation.JsonInclude.Include import org.apache.htrace.fasterxml.jackson.annotation.PropertyAccessor import org.apache.htrace.fasterxml.jackson.core.JsonParser import org.apache.htrace.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature} import scala.reflect.ClassTag import scala.util.Try /** * json处理工具类,基于jackson封装 * * @author ChengLong 2021年4月14日09:27:37 * @since fire 2.0.0 */ object JSONUtils { private[this] lazy val objectMapperLocal = new ThreadLocal[ObjectMapper]() { override def initialValue(): ObjectMapper = newObjectMapperWithDefaultConf } /** * 创建一个新的ObjectMapper实例 */ def newObjectMapper: ObjectMapper = new ObjectMapper /** * 创建一个新的ObjectMapper实例,并设置一系列默认的属性 */ def newObjectMapperWithDefaultConf: ObjectMapper = { this.newObjectMapper .configure(DeserializationFeature.FAIL_ON_IGNORED_PROPERTIES, false) .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) .configure(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES, true) .configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, true) .configure(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS, true) .configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS, true) .configure(SerializationFeature.FAIL_ON_EMPTY_BEANS, false) .setSerializationInclusion(Include.ALWAYS) .setVisibility(PropertyAccessor.ALL, Visibility.ANY) } /** * 从线程局部变量中获取对应的ObjectMapper对象实例 */ def getObjectMapper: ObjectMapper = this.objectMapperLocal.get() /** * 将给定的对象解析成json字符串 * * @param obj 任意的对象实例 * @return json字符串 */ def toJSONString(obj: Object): String = this.getObjectMapper.writeValueAsString(obj) /** * 将给定的json字符串转为T类型对象实例 * * @param json * json字符串 * @tparam T * 目标泛型类型 * @return * 目标对象实例 */ def parseObject[T: ClassTag](json: String): T = this.getObjectMapper.readValue[T](json, getParamType[T]) /** * 将给定的json字符串转为T类型对象实例 * * @param json * json字符串 * @param valueType * 目标类型 * @tparam T * 目标泛型类型 * @return * 目标对象实例 */ def parseObject[T](json: String, valueType: Class[T]): T = this.getObjectMapper.readValue[T](json, valueType) /** * 用于判断给定的字符串是否为合法的json * * @param json * 待校验的字符串 * @param strictMode * 检查模式,如果是true则会进行严格的检查,会牺牲部分性能,如果为false,则只进行简单的检查,性能较好 * @return * true: 合法的字符串 false:非法的json字符串 */ def isJson(json: String, strictMode: Boolean = true): Boolean = { if (strictMode) { Try { try parseObject[JMap[Object, Object]](json) }.isSuccess } else { val jsonStr = StringUtils.trim(json) if (StringUtils.isBlank(jsonStr)) return false jsonStr.startsWith("{") && jsonStr.endsWith("}") } } /** * 用于判断给定的字符串是否为合法的jsonarray * * @param jsonArray * 待校验的字符串 * @param strictMode * 检查模式,如果是true则会进行严格的检查,会牺牲部分性能,如果为false,则只进行简单的检查,性能较好 * @return * true: 合法的字符串 false:非法的json字符串 */ def isJsonArray(jsonArray: String, strictMode: Boolean = true): Boolean = { if (strictMode) { Try { try parseObject[JList[Object]](jsonArray) }.isSuccess } else { val jsonArrayStr = StringUtils.trim(jsonArray) if 
(StringUtils.isBlank(jsonArrayStr)) return false jsonArrayStr.startsWith("[") && jsonArrayStr.endsWith("]") } } /** * 用于快速判断给定的字符串是否为合法的JsonArray或json * 注:不会验证每个field的合法性,仅做简单校验 * * @param json * 待校验的字符串 * @return * true: 合法的字符串 false:非法的json字符串 */ def isLegal(json: String, strictMode: Boolean = true): Boolean = this.isJson(json, strictMode) || this.isJsonArray(json, strictMode) /** * 用于快速判断给定的字符串是否为合法的JsonArray或json * 注:不会验证每个field的合法性,仅做简单校验 * * @param json * 待校验的字符串 * @return * true: 合法的字符串 false:非法的json字符串 */ def checkJson(json: String, strictMode: Boolean = true): Boolean = this.isLegal(json, strictMode) /** * 解析JSON,并获取指定key对应的值 * * @param json json字符串 * @param key json的key * @return value */ def getValue[T: ClassTag](json: String, key: String, defaultValue: T): T = { if (!this.isLegal(json)) return defaultValue val map = this.parseObject[JHashMap[String, Object]](json) map.getOrElse(key, defaultValue).asInstanceOf[T] } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/JavaTypeMap.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util /** * Java类型映射 * * @author ChengLong * @since 2.0.0 * @create 2020-12-16 15:40 */ trait JavaTypeMap { // Java API库映射 type JInt = java.lang.Integer type JLong = java.lang.Long type JBoolean = java.lang.Boolean type JChar = java.lang.Character type JFloat = java.lang.Float type JShort = java.lang.Short type JDouble = java.lang.Double type JBigDecimal = java.math.BigDecimal type JString = java.lang.String type JStringBuilder = java.lang.StringBuilder type JStringBuffer = java.lang.StringBuffer type JMap[K, V] = java.util.Map[K, V] type JHashMap[K, V] = java.util.HashMap[K, V] type JLinkedHashMap[K, V] = java.util.LinkedHashMap[K, V] type JConcurrentHashMap[K, V] = java.util.concurrent.ConcurrentHashMap[K, V] type JSet[E] = java.util.Set[E] type JHashSet[E] = java.util.HashSet[E] type JLinkedHashSet[E] = java.util.LinkedHashSet[E] type JList[E] = java.util.List[E] type JArrayList[E] = java.util.ArrayList[E] type JLinkedList[E] = java.util.LinkedList[E] type JQueue[E] = java.util.Queue[E] type JPriorityQueue[E] = java.util.PriorityQueue[E] type JCollections = java.util.Collections } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/KafkaUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util import com.zto.fire.common.conf.FireKafkaConf import com.zto.fire.predef._ import org.apache.commons.lang3.StringUtils import org.apache.kafka.clients.consumer.{ConsumerConfig, KafkaConsumer, OffsetAndTimestamp} import org.apache.kafka.common.TopicPartition import org.apache.kafka.common.serialization.StringDeserializer import java.util import java.util.Properties /** * Kafka工具类 * * @author ChengLong 2020-4-17 09:50:50 */ object KafkaUtils extends Logging { private lazy val kafkaMonitor = "fire_kafka_consumer" /** * 根据kafka集群名称获取broker地址 * * @param clusterName 集群名称 * @return broker地址 */ def getBorkers(clusterName: String): String = FireKafkaConf.kafkaMap.getOrElse(clusterName, "") /** * 创建新的kafka consumer * * @param host kafka broker地址 * @param groupId 对应的groupId * @return KafkaConsumer */ def createNewConsumer(host: String, groupId: String): KafkaConsumer[String, String] = { val properties = new Properties properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, host) properties.put(ConsumerConfig.GROUP_ID_CONFIG, groupId) properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false") properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, classOf[StringDeserializer].getName) properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, classOf[StringDeserializer].getName) properties.put("auto.offset.reset", "earliest") new KafkaConsumer[String, String](properties) } /** * 获取大于指定时间戳的一条消息 * * @param host broker地址 * @param topic topic信息 * @param timestamp 消息时间戳 * @return 一条消息记录 */ def getMsg(host: String, topic: String, timestamp: java.lang.Long): String = { var kafkaConsumer: KafkaConsumer[String, String] = null var msg = "" try { kafkaConsumer = createNewConsumer(host, kafkaMonitor) // 如果指定了时间戳,则取大于该时间戳的消息 if (timestamp != null) { // 获取topic的partition信息 val partitionInfos = kafkaConsumer.partitionsFor(topic) val topicPartitions = new util.ArrayList[TopicPartition] val timestampsToSearch = new util.HashMap[TopicPartition, java.lang.Long] for (partitionInfo <- partitionInfos) { topicPartitions.add(new TopicPartition(partitionInfo.topic, partitionInfo.partition)) timestampsToSearch.put(new TopicPartition(partitionInfo.topic, partitionInfo.partition), timestamp) } // 手动指定各分区offset kafkaConsumer.assign(topicPartitions) // 获取每个partition指定时间戳的偏移量 val map = kafkaConsumer.offsetsForTimes(timestampsToSearch) this.logger.info("根据时间戳获取偏移量:map.size={}", map.size()) var offsetTimestamp: OffsetAndTimestamp = null this.logger.info("开始设置各分区初始偏移量...") for (entry <- map.entrySet) { // 如果设置的查询偏移量的时间点大于最大的索引记录时间,那么value就为空 offsetTimestamp = entry.getValue if (offsetTimestamp != null) { // 设置读取消息的偏移量 val offset: java.lang.Long = offsetTimestamp.offset kafkaConsumer.seek(entry.getKey, offset) this.logger.info("seek: id=" + entry.getKey.partition + " offset=" + offset) } } } else { // 如果未指定时间戳,则直接获取消息 kafkaConsumer.subscribe(util.Arrays.asList(topic)) } // 消费消息 val records = kafkaConsumer.poll(10000) for (record <- 
records if StringUtils.isBlank(msg)) { if (timestamp == null) { msg = record.value } else { // 如果指定时间戳,则取大于指定时间戳的消息 if (record.timestamp >= timestamp) { msg = record.value } } } } catch { case e: Exception => logger.error("获取消息失败", e) } finally { if (kafkaConsumer != null) kafkaConsumer.close() } msg } /** * kafka配置信息 * * @param kafkaParams * 代码中指定的kafka配置信息,如果配置文件中也有配置,则配置文件中的优先级高 * @param groupId * 消费组 * @param offset * smallest、largest * @return * kafka相关配置 */ def kafkaParams(kafkaParams: Map[String, Object] = null, groupId: String = null, kafkaBrokers: String = null, offset: String = FireKafkaConf.offsetLargest, autoCommit: Boolean = false, keyNum: Int = 1): Map[String, Object] = { val consumerMap = collection.mutable.Map[String, Object]() // 代码中指定的kafka配置优先级最低 if (kafkaParams != null && kafkaParams.nonEmpty) consumerMap ++= kafkaParams // 如果没有在配置文件中指定brokers,则认为从代码中获取,此处返回空的map,用于上层判断 val confBrokers = FireKafkaConf.kafkaBrokers(keyNum) val finalKafkaBrokers = if (StringUtils.isNotBlank(confBrokers)) confBrokers else kafkaBrokers if (StringUtils.isNotBlank(finalKafkaBrokers)) consumerMap += ("bootstrap.servers" -> finalKafkaBrokers) // 如果配置文件中没有指定spark.kafka.group.id,则默认获取用户指定的groupId val confGroupId = FireKafkaConf.kafkaGroupId(keyNum) val finalKafkaGroupId = if (StringUtils.isNotBlank(confGroupId)) confGroupId else groupId if (StringUtils.isNotBlank(finalKafkaGroupId)) consumerMap += ("group.id" -> finalKafkaGroupId) val confOffset = FireKafkaConf.kafkaStartingOffset(keyNum) val finalOffset = if (StringUtils.isNotBlank(confOffset)) confOffset else offset if (StringUtils.isNotBlank(finalOffset)) consumerMap += ("auto.offset.reset" -> finalOffset) val confAutoCommit = FireKafkaConf.kafkaEnableAutoCommit(keyNum) val finalAutoCommit = if (confAutoCommit != null) confAutoCommit else autoCommit if (finalAutoCommit != null) consumerMap += ("enable.auto.commit" -> (finalAutoCommit: java.lang.Boolean)) // 最基本的配置项 consumerMap ++= collection.mutable.Map[String, Object]( "key.deserializer" -> classOf[StringDeserializer], "value.deserializer" -> classOf[StringDeserializer], "session.timeout.ms" -> FireKafkaConf.kafkaSessionTimeOut(keyNum), "request.timeout.ms" -> FireKafkaConf.kafkaRequestTimeOut(keyNum), "max.poll.interval.ms" -> FireKafkaConf.kafkaPollInterval(keyNum) ) // 以spark.kafka.conf.开头的配置优先级最高 val configMap = FireKafkaConf.kafkaConfMapWithType(keyNum) if (configMap.nonEmpty) consumerMap ++= configMap // 日志记录最终生效的kafka配置 LogUtils.logMap(this.logger, consumerMap.toMap, s"Kafka client configuration. keyNum=$keyNum.") consumerMap.toMap } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/LineageManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
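 */

/*
 * LineageManager below is private[fire]; a hedged sketch of how a connector inside the
 * framework records lineage with it (added for illustration, not part of the original file).
 * The datasource names, cluster addresses and Operation values are assumptions:
 *
 *   // inside the com.zto.fire package, e.g. from a JDBC connector
 *   LineageManager.addDBDatasource("jdbc", "jdbc:mysql://host:3306/db", "t_order",
 *     username = "fire", operation = Operation.INSERT)
 *   // and from an MQ connector
 *   LineageManager.addMQDatasource("kafka", "broker1:9092", "topic_a", "group_a")
 *
 * Recorded entries are merged asynchronously by the scheduled parser defined in this file and
 * exposed through getLineage as a Lineage bean.
 */

/*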
*/ package com.zto.fire.common.util import com.zto.fire.common.bean.lineage.Lineage import com.zto.fire.common.conf.FireFrameworkConf._ import com.zto.fire.common.enu.{Datasource, Operation, ThreadPoolType} import com.zto.fire.predef._ import org.apache.commons.lang3.StringUtils import java.util.Objects import java.util.concurrent._ import scala.collection.{JavaConversions, mutable} /** * 用于统计当前任务使用到的数据源信息,包括MQ、DB、hive等连接信息等 * * @author ChengLong * @since 2.0.0 * @create 2020-11-26 15:30 */ private[fire] class LineageManager extends Logging { // 用于存放当前任务用到的数据源信息 private[fire] lazy val lineageMap = new ConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]() private[fire] lazy val tableMetaSet = new CopyOnWriteArraySet[TableMeta]() // 用于收集来自不同数据源的sql语句,后续会异步进行SQL解析,考虑到分布式场景下会有很多重复的SQL执行,因此使用了线程不安全的队列即可满足需求 private lazy val dbSqlQueue = new ConcurrentLinkedQueue[DBSqlSource]() // 用于解析数据源的异步定时调度线程 private lazy val parserExecutor = ThreadUtils.createThreadPool("LineageManager", ThreadPoolType.SCHEDULED).asInstanceOf[ScheduledExecutorService] private var parseCount = 0 // 用于收集各实时引擎执行的sql语句 this.lineageParse() /** * 用于异步解析sql中使用到的表,并放到linageMap中 */ private[this] def lineageParse(): Unit = { if (lineageEnable) { this.parserExecutor.scheduleWithFixedDelay(new Runnable { override def run(): Unit = { parseCount += 1 if (parseCount >= lineageRunCount && !parserExecutor.isShutdown) { logger.info(s"4. 异步解析实时血缘的定时任务采样共计:${lineageRunCount}次,即将退出异步线程") parserExecutor.shutdown() } // 1. 解析jdbc sql语句 val start = currentTime tryWithLog { for (_ <- 1 until dbSqlQueue.size()) { val sqlSource = dbSqlQueue.poll() if (sqlSource != null) { val tableNames = SQLUtils.tableParse(sqlSource.sql) if (tableNames != null && tableNames.nonEmpty) { tableNames.filter(StringUtils.isNotBlank).foreach(tableName => { add(Datasource.parse(sqlSource.datasource), DBDatasource(sqlSource.datasource, sqlSource.cluster, tableName, sqlSource.username, operation = sqlSource.operation)) }) } } } } (logger, s"1. 开始第${parseCount}/${lineageRunCount}次解析JDBC中的血缘信息", "jdbc血缘信息解析失败") // 2. 将解析好的引擎SQL血缘按Datasource进行分类 tryWithLog { tableMetaSet.foreach(tableMeta => { val prop = tableMeta.properties val operationSet = Set(tableMeta.operation) tableMeta.datasource match { case Datasource.KAFKA => { val dataSource = MQDatasource(Datasource.KAFKA.toString, prop.getOrDefault("properties.bootstrap.servers", ""), prop.getOrDefault("topic", ""), prop.getOrDefault("properties.group.id", ""), operationSet) add(Datasource.KAFKA, dataSource) } case Datasource.FIRE_ROCKETMQ => { val datasource = MQDatasource(Datasource.ROCKETMQ.toString, PropUtils.getString(prop.getOrDefault("rocket.brokers.name", "")), prop.getOrDefault("rocket.topics", ""), prop.getOrDefault("rocket.group.id", ""), operationSet) add(Datasource.FIRE_ROCKETMQ, datasource) } case Datasource.JDBC => { val driver = prop.getOrDefault("driver", "") val url = prop.getOrDefault("url", "") val user = prop.getOrDefault("username", "") val datasource = DBDatasourceDetail(Datasource.JDBC.toString, url, tableMeta.tableName, user, operationSet) add(Datasource.JDBC, datasource) } case _ => add(tableMeta.datasource, tableMeta) } }) } (logger, s"2. 开始第${parseCount}/${lineageRunCount}次解析SQL中的血缘关系", "sql血缘关系解析失败") logger.info(s"3. 
完成第${parseCount}/${lineageRunCount}次异步解析SQL埋点中的表信息,耗时:${elapsed(start)}") } }, lineageRunInitialDelay, lineageRunPeriod, TimeUnit.SECONDS) } } /** * 添加一个数据源描述信息 */ private[fire] def add(sourceType: Datasource, datasourceDesc: DatasourceDesc): Unit = { if (!lineageEnable) return val set = this.lineageMap.mergeGet(sourceType)(new JHashSet[DatasourceDesc]()) if (set.isEmpty) set.add(datasourceDesc) val mergedSet = this.mergeDatasource(set, datasourceDesc) this.lineageMap.put(sourceType, mergedSet) } /** * merge相同数据源的对象 */ private[fire] def mergeDatasource(datasourceList: JHashSet[DatasourceDesc], datasourceDesc: DatasourceDesc): JHashSet[DatasourceDesc] = { val mergeSet = new CopyOnWriteArraySet[DatasourceDesc](datasourceList) mergeSet.foreach { case ds: DBDatasource => { if (datasourceDesc.isInstanceOf[DBDatasource]) { val target = datasourceDesc.asInstanceOf[DBDatasource] if (ds.equals(target)) { ds.operation.addAll(target.operation) } else { mergeSet.add(datasourceDesc) } } } case ds: DBDatasourceDetail => { if (datasourceDesc.isInstanceOf[DBDatasourceDetail]) { val target = datasourceDesc.asInstanceOf[DBDatasourceDetail] if (ds.equals(target)) { ds.operation.addAll(target.operation) } else { mergeSet.add(datasourceDesc) } } } case ds: DBSqlSource => { if (datasourceDesc.isInstanceOf[DBSqlSource]) { val target = datasourceDesc.asInstanceOf[DBSqlSource] if (ds.equals(target)) { ds.operation.addAll(target.operation) } else { mergeSet.add(datasourceDesc) } } } case ds: MQDatasource => { if (datasourceDesc.isInstanceOf[MQDatasource]) { val target = datasourceDesc.asInstanceOf[MQDatasource] if (ds.equals(target)) { ds.operation.addAll(target.operation) } else { mergeSet.add(datasourceDesc) } } } case _ => } new JHashSet[DatasourceDesc](mergeSet) } /** * 向队列中添加一条sql类型的数据源,用于后续异步解析 */ private[fire] def addDBDataSource(source: DBSqlSource): Unit = if (lineageEnable && this.dbSqlQueue.size() <= lineMaxSize) this.dbSqlQueue.offer(source) /** * 收集执行的sql语句 */ private[fire] def addTableMeta(tableMetaSet: JSet[TableMeta]): Unit = if (lineageEnable) this.tableMetaSet.addAll(tableMetaSet) /** * 获取所有使用到的数据源 */ private[fire] def get: JConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]] = this.lineageMap } /** * 对外暴露API,用于收集并处理各种埋点信息 */ private[fire] object LineageManager extends Logging { private[fire] lazy val manager = new LineageManager /** * 添加一条sql记录到队列中 * * @param datasource * 数据源类型 * @param cluster * 集群信息 * @param username * 用户名 * @param sql * 待解析的sql语句 */ private[fire] def addDBSql(datasource: String, cluster: String, username: String, sql: String, operation: Operation*): Unit = { this.manager.addDBDataSource(DBSqlSource(datasource, cluster, username, sql, toOperationSet(operation: _*))) } /** * 添加解析后的TableMeta到队列中 * * @param tableMeta 待解析的sql语句 */ def addTableMeta(tableMeta: JSet[TableMeta]): Unit = LineageManager.manager.addTableMeta(tableMeta) /** * 添加一条DB的埋点信息 * * @param datasource * 数据源类型 * @param cluster * 集群信息 * @param tableName * 表名 * @param username * 连接用户名 */ private[fire] def addDBDatasource(datasource: String, cluster: String, tableName: String, username: String = "", operation: Operation): Unit = { this.manager.add(Datasource.parse(datasource), DBDatasource(datasource, cluster, tableName, username, toOperationSet(operation))) } /** * 添加多个数据源操作 */ private[fire] def toOperationSet(operation: Operation*): JHashSet[Operation] = { val operationSet = new JHashSet[Operation] operation.foreach(operationSet.add) operationSet } /** * 添加一条MQ的埋点信息 * * @param datasource * 数据源类型 * @param cluster 
* 集群标识 * @param topics * 主题列表 * @param groupId * 消费组标识 */ private[fire] def addMQDatasource(datasource: String, cluster: String, topics: String, groupId: String, operation: Operation*): Unit = { this.manager.add(Datasource.parse(datasource), MQDatasource(datasource, cluster, topics, groupId, toOperationSet(operation: _*))) } /** * 获取所有使用到的数据源 */ private[fire] def getDatasourceLineage: JConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]] = this.manager.get /** * 获取完整的实时血缘信息 */ private[fire] def getLineage: Lineage = { new Lineage(this.getDatasourceLineage, SQLLineageManager.getSQLLineage) } /** * 合并两个血缘map * @param current * 待合并的map * @param target * 目标map * @return * 合并后的血缘map */ def mergeLineageMap(current: JConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]], target: JConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]): JConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]] = { target.foreach(ds => { val datasourceDesc = current.mergeGet(ds._1)(ds._2) if (ds._2.nonEmpty) { ds._2.foreach(desc => { current.put(ds._1, this.manager.mergeDatasource(datasourceDesc, desc)) }) } }) current } } /** * 数据源描述 */ trait DatasourceDesc /** * 面向数据库类型的数据源,带有tableName * * @param datasource * 数据源类型,参考DataSource枚举 * @param cluster * 数据源的集群标识 * @param tableName * 表名 * @param username * 使用关系型数据库时作为jdbc的用户名,HBase留空 * @param operation 数据源操作类型 */ case class DBDatasource(datasource: String, cluster: String, tableName: String, username: String = "", operation: JSet[Operation] = new JHashSet[Operation]) extends DatasourceDesc { override def equals(obj: Any): Boolean = { if (obj == null || getClass != obj.getClass) return false val target = obj.asInstanceOf[DBDatasource] Objects.equals(datasource, target.datasource) && Objects.equals(cluster, target.cluster) && Objects.equals(tableName, target.tableName) && Objects.equals(username, target.username) } override def hashCode(): Int = Objects.hash(datasource, cluster, tableName, username) } /** * @param operation 针对表的具体操作类型 */ case class DBDatasourceDetail(datasource: String, cluster: String, tableName: String, username: String = "", operation: JSet[Operation] = new JHashSet[Operation]) extends DatasourceDesc { override def equals(obj: Any): Boolean = { if (obj == null || getClass != obj.getClass) return false val target = obj.asInstanceOf[DBDatasourceDetail] Objects.equals(datasource, target.datasource) && Objects.equals(cluster, target.cluster) && Objects.equals(username, target.username) } override def hashCode(): Int = Objects.hash(datasource, cluster, tableName, username) } /** * 面向数据库类型的数据源,需将SQL中的tableName主动解析 * * @param datasource * 数据源类型,参考DataSource枚举 * @param cluster * 数据源的集群标识 * @param username * 使用关系型数据库时作为jdbc的用户名,HBase留空 * @param sql 执行的SQL语句 * @param operation 数据源操作类型 */ case class DBSqlSource(datasource: String, cluster: String, username: String, sql: String, operation: JSet[Operation] = new JHashSet[Operation]) extends DatasourceDesc { override def equals(obj: Any): Boolean = { if (obj == null || getClass != obj.getClass) return false val target = obj.asInstanceOf[DBSqlSource] Objects.equals(datasource, target.datasource) && Objects.equals(cluster, target.cluster) && Objects.equals(username, target.username) && Objects.equals(sql, target.sql) } override def hashCode(): Int = Objects.hash(datasource, cluster, username, sql) } /** * MQ类型数据源,如:kafka、RocketMQ等 * * @param datasource * 数据源类型,参考DataSource枚举 * @param cluster * 数据源的集群标识 * @param operation * 数据源操作类型 * @param topics * 使用到的topic列表 * @param groupId * 任务的groupId */ case class 
MQDatasource(datasource: String, cluster: String, topics: String, groupId: String, operation: JSet[Operation] = new JHashSet[Operation]) extends DatasourceDesc { override def equals(obj: Any): Boolean = { if (obj == null || getClass != obj.getClass) return false val target = obj.asInstanceOf[MQDatasource] Objects.equals(datasource, target.datasource) && Objects.equals(cluster, target.cluster) && Objects.equals(topics, target.topics) && Objects.equals(groupId, target.groupId) } override def hashCode(): Int = Objects.hash(datasource, cluster, topics, groupId) } /** * sql解析后的库表信息包装类 * * @param dbName 数据库名称 * @param tableName 表名 * @param partition 分区信息 * @param datasource 所属的catalog(default、hive等) * @param operation 针对该表的操作类型:SELECT、INSERT、DROP等 * @param properties 标的属性,如with列表属性等 */ case class TableMeta(dbName: String = "", tableName: String = "", partition: mutable.Map[String, String] = mutable.Map.empty, var datasource: Datasource = Datasource.VIEW, operation: Operation = Operation.SELECT, properties: mutable.Map[String, String] = mutable.Map.empty) extends DatasourceDesc ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/LogUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.util import com.zto.fire.common.conf.FirePS1Conf import org.apache.commons.lang3.StringUtils import org.slf4j.Logger import org.slf4j.event.Level /** * 日志工具类 * * @author ChengLong * @since 1.0.0 * @create 2020-07-01 10:23 */ object LogUtils extends Logging { /** * 以固定的开始与结束风格打日志 * * @param logger * 日志记录器 * @param title * 日志开始标题 * @param style * 日志开始标题类型 * @param level * 日志的级别 * @param fun * 用户自定义的操作 */ def logStyle(logger: Logger, title: String = "", style: String = "-", level: Level = Level.INFO)(fun: Logger => Unit): Unit = { if (logger != null) { val styleRepeat = StringUtils.repeat(style, 19) val titleStart = styleRepeat + s"${FirePS1Conf.GREEN}> start: " + title + s" <${FirePS1Conf.DEFAULT}" + styleRepeat this.logLevel(logger, titleStart, level) fun(logger) val titleEnd = styleRepeat + s"${FirePS1Conf.GREEN}> end: " + title + s" <${FirePS1Conf.DEFAULT}" + styleRepeat this.logLevel(logger, titleEnd, level) } } /** * 以固定的风格打印map中的内容 */ def logMap(logger: Logger = this.logger, map: Map[_, _], title: String): Unit = { if (logger != null && map != null && map.nonEmpty) { LogUtils.logStyle(logger, title)(logger => { map.foreach(kv => logger.info(s"---> ${kv._1} = ${kv._2}")) }) } } /** * 根据指定的基本进行日志记录 * * @param logger * 日志记录器 * @param log * 日志内容 * @param level * 日志的级别 */ def logLevel(logger: Logger, log: String, level: Level = Level.INFO, ps: String = null): Unit = { val logMsg = if (StringUtils.isNotBlank(ps)) s"$ps $log ${FirePS1Conf.DEFAULT}" else log level match { case Level.DEBUG => logger.debug(logMsg) case Level.INFO => logger.info(logMsg) case Level.WARN => logger.warn(logMsg) case Level.ERROR => logger.error(logMsg) case Level.TRACE => logger.trace(logMsg) } } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/Logging.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.util import org.slf4j.{Logger, LoggerFactory} /** * 日志记录器 * * @author ChengLong 2021-11-2 15:41:30 * @since 2.2.0 */ trait Logging { private lazy val log_ = LoggerFactory.getLogger(this.getClass) /** * 获取日志对象 */ protected def logger: Logger = log_ protected def logInfo(msg: => String): Unit = { if (this.logger.isInfoEnabled) this.logger.info(msg) } protected def logDebug(msg: => String): Unit = { if (this.logger.isDebugEnabled) this.logger.debug(msg) } protected def logTrace(msg: => String): Unit = { if (this.logger.isTraceEnabled) this.logger.trace(msg) } protected def logWarning(msg: => String): Unit = { if (this.logger.isWarnEnabled) this.logger.warn(msg) } protected def logError(msg: => String): Unit = { if (this.logger.isErrorEnabled) this.logger.error(msg) } protected def logInfo(msg: => String, throwable: Throwable): Unit = { if (this.logger.isInfoEnabled) this.logger.info(msg, throwable) } protected def logDebug(msg: => String, throwable: Throwable): Unit = { if (this.logger.isDebugEnabled) this.logger.debug(msg, throwable) } protected def logTrace(msg: => String, throwable: Throwable): Unit = { if (this.logger.isTraceEnabled) this.logger.trace(msg, throwable) } protected def logWarning(msg: => String, throwable: Throwable): Unit = { if (this.logger.isWarnEnabled) this.logger.warn(msg, throwable) } protected def logError(msg: => String, throwable: Throwable): Unit = { if (this.logger.isErrorEnabled) this.logger.error(msg, throwable) } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/MQProducer.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.util import com.zto.fire.common.anno.Internal import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.enu.JobType import com.zto.fire.predef._ import com.zto.fire.common.util.MQType.MQType import com.zto.fire.common.util.ShutdownHookManager.DEFAULT_PRIORITY import org.apache.kafka.clients.producer.{Callback, KafkaProducer, ProducerConfig, ProducerRecord, RecordMetadata} import org.apache.kafka.common.serialization.StringSerializer import org.apache.rocketmq.client.producer.{DefaultMQProducer, SendCallback, SendResult} import org.apache.rocketmq.common.message.Message import org.apache.rocketmq.remoting.common.RemotingHelper import java.util.Properties import java.util.concurrent.atomic.AtomicBoolean /** * 消息队列管理器:内置常用MQ的发送API,消息的key与value默认均为String类型 * 注:考虑到spark和flink在实时场景下不需要额外的api消费mq的场景,故暂不提供消费api * * @author ChengLong 2022-07-29 10:02:48 * @since 2.3.1 */ private[fire] class MQProducer(url: String, mqType: MQType = MQType.kafka, otherConf: Map[String, String] = Map.empty) extends Logging { private lazy val maxRetries = FireFrameworkConf.exceptionTraceSendMQMaxRetries private lazy val sendTimeout = FireFrameworkConf.exceptionSendTimeout private var sendErrorCount = 0 private lazy val isRelease = new AtomicBoolean(false) private var useKafka, useRocketmq = false // kafka producer private lazy val kafkaProducer = { val props = new Properties() props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, this.url) props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer]) props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer]) props.put(ProducerConfig.MAX_BLOCK_MS_CONFIG, this.sendTimeout.toString) this.otherConf.foreach(prop => props.put(prop._1, prop._2)) val producer = new KafkaProducer[String, String](props) this.useKafka = true producer } // rocketmq producer private lazy val rocketmqProducer = { val producer = new DefaultMQProducer("fire") producer.setNamesrvAddr(this.url) producer.setSendMsgTimeout(this.sendTimeout) producer.start() this.useRocketmq = true producer } /** * 释放producer资源 */ private[fire] def close: Unit = { if (this.isRelease.compareAndSet(false, true)) { if (this.useKafka) { this.kafkaProducer.flush() this.kafkaProducer.close() } if (this.useRocketmq) { this.rocketmqProducer.shutdown() } } } /** * 发送消息到kafka * * @param topic * 主题名称 * @param msg * 发送的消息 */ def sendKafka(topic: String, msg: String): Unit = { requireNonEmpty(topic, "topic不能为空") if (this.sendErrorCount >= this.maxRetries) { this.kafkaProducer.close() logger.error(s"异常信息发送MQ重试${this.sendErrorCount}次仍失败,将退出异常信息发送!") return } val record = new ProducerRecord[String, String](topic, msg) kafkaProducer.send(record, new Callback() { override def onCompletion(recordMetadata: RecordMetadata, exception: Exception): Unit = { if (exception != null) { sendErrorCount += 1 logger.warn("Send msg to kafka failed!", exception) } else sendErrorCount = 0 } }) } /** * 发送消息到rocketmq * * @param topic * 主题名称 * @param msg * 消息体 * @param tags * tag * @param timeout * 发送超时时间 */ def sendRocketmq(topic: String, msg: String, tags: String = "*", timeout: Long = 10000): Unit = { requireNonEmpty(topic, "topic不能为空") if (this.sendErrorCount >= this.maxRetries) { this.rocketmqProducer.shutdown() return } val record = new Message(topic, tags, msg.getBytes(RemotingHelper.DEFAULT_CHARSET)) this.rocketmqProducer.send(record, new SendCallback { override def onSuccess(sendResult: SendResult): Unit = { // do nothing } override def onException(exception: 
Throwable): Unit = { if (exception != null) { sendErrorCount += 1 logger.warn("Send msg to rocketmq failed!", exception) } else sendErrorCount = 0 } }, timeout) } /** * 发送消息到指定的消息队列 * * @param topic * 主题名称 * @param msg * 消息体 */ def send(topic: String, msg: String): Unit = { this.mqType match { case MQType.rocketmq => this.sendRocketmq(topic, msg) case _ => this.sendKafka(topic, msg) } } } object MQProducer { // 用于维护多个producer实例,避免重复创建 private lazy val kafkaProducerMap = new JConcurrentHashMap[String, MQProducer]() this.addHook() (this.release) /** * 释放所有使用了的producer资源,会被fire框架自动调用 */ @Internal private[fire] def release: Unit = { kafkaProducerMap.foreach(t => t._2.close) } /** * 注册jvm退出前回调,在任务退出前完成消息的发出 * @param fun * 消息发送逻辑 */ private[fire] def addHook(priority: Int = ShutdownHookManager.LOW_PRIORITY)(fun: => Unit): Unit = { // 注册回调,在jvm退出前将所有异常发送到mq中 ShutdownHookManager.addShutdownHook(priority) (() => { fun }) } def apply(url: String, mqType: MQType = MQType.kafka, otherConf: Map[String, String] = Map.empty) = new MQProducer(url, mqType, otherConf) /** * 发送消息到指定的mq topic * @param mqType * mq的类别:kafka/rocketmq * @param otherConf * 优化参数 */ def send(url: String, topic: String, msg: String, mqType: MQType = MQType.kafka, otherConf: Map[String, String] = Map.empty): Unit = { val producer = this.kafkaProducerMap.mergeGet(url + ":" + topic)(new MQProducer(url, mqType, otherConf)) producer.send(topic, msg) } /** * 将消息方式到kafka */ def sendKafka(url: String, topic: String, msg: String, otherConf: Map[String, String] = Map.empty): Unit = this.send(url, topic, msg, MQType.kafka, otherConf) /** * 将消息发送到rocketmq */ def sendRocketMQ(url: String, topic: String, msg: String, otherConf: Map[String, String] = Map.empty): Unit = this.send(url, topic, msg, MQType.rocketmq, otherConf) } /** * 主流MQ产品枚举类 */ object MQType extends Enumeration { type MQType = Value val kafka = Value("kafka") val rocketmq = Value("rocketmq") } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/NumberFormatUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util import java.math.BigDecimal /** * 数值类型常用操作工具类 * Created by ChengLong on 2018-06-01. 
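 *
 * A minimal usage sketch added for illustration (assumes the documented "no rounding"
 * semantics of the truncate-style methods; results shown are therefore indicative only):
 * {{{
 *   NumberFormatUtils.floor(3.8)            // 3
 *   NumberFormatUtils.long2Int(100L)        // 100
 *   NumberFormatUtils.truncate(3.149, 2)    // 3.14 (truncated to 2 decimals, not rounded)
 * }}}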
*/
object NumberFormatUtils {

  /**
   * floor操作
   *
   * @param field
   * @return
   */
  def floor(field: java.lang.Double): Int = {
    if (field == null) 0 else Math.floor(field).toInt
  }

  /**
   * 将Long转为Integer
   *
   * @param field
   * @return
   */
  def long2Int(field: java.lang.Long): java.lang.Integer = {
    if (field != null) {
      field.toInt
    } else {
      0
    }
  }

  /**
   * 将BigDecimal转为Long类型
   *
   * @param field
   * @return
   */
  def bigDecimal2Long(field: java.math.BigDecimal): java.lang.Long = {
    if (field != null) {
      field.longValue()
    } else {
      0L
    }
  }

  /**
   * 判断是否为空
   *
   * @param decimal
   * @return
   */
  def ifnull(decimal: java.math.BigDecimal, defaultVal: java.math.BigDecimal): java.math.BigDecimal = {
    if (decimal == null) defaultVal else decimal
  }

  /**
   * 类似于round,但不会四舍五入
   *
   * @param value
   * 目标值
   * @param scale
   * 精度
   * @return
   */
  def truncate(value: java.lang.Double, scale: Int): Double = {
    if (value == null) {
      0.0
    } else {
      // 按指定精度直接截取,不做四舍五入
      new BigDecimal(value).setScale(Math.abs(scale), BigDecimal.ROUND_DOWN).doubleValue()
    }
  }

  /**
   * 基于10的幂实现的精度截取,效果与truncate类似
   */
  def truncate2(value: java.lang.Double, scale: Int): Double = {
    if (value == null) {
      0.0
    } else if (scale == 0) {
      value.toLong
    } else {
      val tmp = Math.pow(10, Math.abs(scale))
      (value * tmp).toInt / tmp
    }
  }

  /**
   * 截取精度
   *
   * @param bigDecimal
   * @param scale
   * 精度
   * @return
   */
  def truncateDecimal(bigDecimal: java.math.BigDecimal, scale: Int): java.math.BigDecimal = {
    if (bigDecimal == null) {
      new java.math.BigDecimal("0").setScale(scale, BigDecimal.ROUND_HALF_UP)
    } else {
      bigDecimal.setScale(scale, BigDecimal.ROUND_HALF_UP)
    }
  }
}


================================================
FILE: fire-common/src/main/scala/com/zto/fire/common/util/PropUtils.scala
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.zto.fire.common.util

import com.zto.fire.common.anno.{Config, Internal}
import com.zto.fire.common.conf._
import com.zto.fire.common.enu.ConfigureLevel
import com.zto.fire.predef._
import org.apache.commons.lang3.StringUtils
import org.slf4j.LoggerFactory

import java.io.{FileInputStream, InputStream, StringReader}
import java.util.Properties
import java.util.concurrent.atomic.AtomicBoolean
import scala.collection.mutable.{ArrayBuffer, Map}
import scala.collection.{immutable, mutable}
import scala.reflect.{ClassTag, classTag}

/**
 * 读取配置文件工具类
 * Created by ChengLong on 2016-11-22.
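 *
 * A minimal usage sketch added for illustration (the configuration keys and values below
 * are hypothetical examples, not framework defaults):
 * {{{
 *   PropUtils.loadFile("my-job")   // loads my-job.properties from the classpath
 *   val brokers = PropUtils.getString("kafka.brokers", "localhost:9092", keyNum = 2)
 *   val batch   = PropUtils.getInt("sink.batch.size", 1000)
 *   val enable  = PropUtils.getBoolean("feature.enable", default = false)
 * }}}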
*/ object PropUtils extends Logging { private val props = new Properties() private val configurationFiles = Array[String]("fire", "cluster", "spark", "flink") // 用于判断是否merge过 private[fire] val isMerge = new AtomicBoolean(false) // 引擎类型判断,当前阶段仅支持spark与flink,未来若支持新的引擎,则需在此处做支持 private[fire] val engine = if (this.isExists("spark")) "spark" else "flink" // 加载默认配置文件 this.load(this.configurationFiles: _*) // 避免已被加载的配置文件被重复加载 private[this] lazy val alreadyLoadMap = new mutable.HashMap[String, String]() // 用于存放自适应引擎前缀的配置信息 private[fire] lazy val adaptiveSettingsMap = new mutable.HashMap[String, String]() // 用于存放原始的配置信息 private[fire] lazy val originalSettingsMap = new mutable.HashMap[String, String]() // 用于存放固定前缀,而后缀不同的配置信息 private[this] lazy val cachedConfMap = new mutable.HashMap[String, collection.immutable.Map[String, String]]() /** * 判断指定的配置文件是否存在 * * @param fileName * 配置文件名称 */ def isExists(fileName: String): Boolean = { var resource: InputStream = null try { resource = this.getInputStream(fileName) if (resource == null) false else true } finally { if (resource != null) { IOUtils.close(resource) } } } /** * 获取配置信息 */ def apply(key: String, keyNum: Int = 1): String = this.getString(key, "", keyNum = keyNum) /** * 获取完整的配置文件名称 */ private[this] def getFullName(fileName: String): String = if (fileName.endsWith(".properties")) fileName else s"$fileName.properties" /** * 获取指定配置文件的输入流 * 注:此api调用者需主动关闭输入流 * * @param fileName * 配置文件名称 */ private[this] def getInputStream(fileName: String): InputStream = { val fullName = this.getFullName(fileName) var resource: InputStream = null try { resource = FileUtils.resourceFileExists(fullName) if (resource == null) { val findFileName = FindClassUtils.findFileInJar(fullName) if (StringUtils.isNotBlank(findFileName)) { if (FindClassUtils.isJar) { resource = FileUtils.resourceFileExists(findFileName) } else { resource = new FileInputStream(findFileName) } } } resource } } /** * 加载指定配置文件,resources根目录下优先级最高,其次是按字典顺序的目录 * * @param fileName * 配置文件名称 */ def loadFile(fileName: String): this.type = this.synchronized { val fullName = this.getFullName(fileName) if (StringUtils.isNotBlank(fullName) && !this.alreadyLoadMap.contains(fullName)) { var resource: InputStream = null try { resource = this.getInputStream(fullName) if (resource == null && !this.configurationFiles.contains(fileName)) this.logger.warn(s"未找到配置文件[ $fullName ],请核实!") if (resource != null) { this.logger.warn(s"${FirePS1Conf.YELLOW} -------------> loaded ${fullName} <------------- ${FirePS1Conf.DEFAULT}") props.load(resource) // 将所有的配置信息存放到settings中,并统一添加key的引擎前缀,如: // 如果是spark引擎,则key前缀统一添加spark. 如果是flink引擎,则统一添加flink. 
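// 例如:配置文件中的key为"hive.cluster"时,adaptiveSettingsMap中存放的key为"spark.hive.cluster"
// (flink引擎下则为"flink.hive.cluster"),而originalSettingsMap中仍保留原始的"hive.cluster"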
props.foreach(prop => { this.adaptiveSettingsMap.put(this.adaptiveKey(prop._1), prop._2) this.originalSettingsMap.put(prop._1, prop._2) }) props.clear() this.alreadyLoadMap.put(fullName, fullName) } } finally { if (resource != null) { IOUtils.close(resource) } } } this } /** * 加载多个指定配置文件,resources根目录下优先级最高,其次是按字典顺序的目录 * * @param fileNames * 配置文件名称 */ def load(fileNames: String*): this.type = { if (noEmpty(fileNames)) fileNames.foreach(this.loadFile) this } /** * 加载扩展的注解配置信息: * @Kafka、@RocketMQ、@Hive、@HBase等 * * @param clazz * 任务入口类 */ def loadAnnoConf(clazz: Class[_]): this.type = { if (!FireFrameworkConf.annoConfEnable) return this if (clazz == null) return this // 加载通过@Config注解配置的信息 val option = this.getAnnoConfig(clazz) if (option.nonEmpty) { val (files, props, value) = option.get // 解析通过注解配置的多个配置信息 this.parseTextConfig(value).foreach(kv => this.setProperty(kv._1, kv._2)) // 解析通过注解配置的单项配置信息 props.foreach(kv => this.setProperty(kv._1, kv._2)) if (noEmpty(files)) this.load(files: _*) } // 加载其他注解指定的配置信息 val annoManagerClass = FireFrameworkConf.annoManagerClass if (isEmpty(annoManagerClass)) throw new IllegalArgumentException(s"未找到注解管理器,请通过:${FireFrameworkConf.FIRE_CONF_ANNO_MANAGER_CLASS}进行配置!") tryWithLog { val annoClazz = Class.forName(annoManagerClass) val method = ReflectionUtils.getMethodByName(annoClazz, "getAnnoProps") if (isEmpty(method)) throw new RuntimeException(s"未找到getAnnoProps()方法,通过${FireFrameworkConf.FIRE_CONF_ANNO_MANAGER_CLASS}指定的类必须是com.zto.fire.core.conf.AnnoManager的子类") val annoProps = method.invoke(annoClazz.newInstance(), clazz) this.setProperties(annoProps.asInstanceOf[mutable.HashMap[String, String]]) } (this.logger, "成功加载注解中的配置信息!", "注解配置信息加载失败!") this } /** * 将多行字符串文本解析成key value的形式 * @param value * 配置信息,支持井号注释与多行配置 */ private[fire] def parseTextConfig(value: String): Map[String, String] = { val mapConfig = new JHashMap[String, String]() if (noEmpty(value)) { // 移除所有的注释信息 val normalValue = RegularUtils.propAnnotation.replaceAllIn(value, "").replaceAll("\\|", "").trim val valueProps = new Properties() val stringReader = new StringReader(normalValue) valueProps.load(stringReader) stringReader.close() val propMap = valueProps.map(kv => (StringUtils.trim(kv._1), StringUtils.trim(kv._2))).filter(kv => noEmpty(kv, kv._1, kv._2)).toMap mapConfig.putAll(propMap) } mapConfig } /** * 加载注解配置信息 * * @param clazz * 任务入口类 */ def loadJobConf(clazz: Class[_]): this.type = { if (clazz == null) return this this.load(clazz.getSimpleName.replace("$", "")) this } /** * 获取配置中心配置信息并加载用户配置以及注解配置 * 配置的优先级:fire公共配置 < 配置中心公共配置 < 用户任务配置 < 配置中心任务级别配置 < 配置中心紧急配置 * * @param className * 入口类的包名+类名 */ def loadJobConf(className: String): this.type = { // 通过接口调用获取配置中心配置各等级的参数信息 val centerConfig = this.invokeConfigCenter(className) // 配置中心的默认配置优先级高于框架(fire.properties)以及引擎(spark.properties/flink.properties)等配置 this.setProperties(centerConfig.getOrDefault(ConfigureLevel.FRAMEWORK, Map.empty[String, String])) // 加载扩展类注解配置(@Kafka、@RocketMQ、@Hive、@HBase等) this.loadAnnoConf(Class.forName(className)) // 加载用户配置文件以及@Config注解配置 this.loadJobConf(Class.forName(className)) // 配置中心任务级别配置优先级高于用户本地配置文件中的配置,做到重启任务即可生效 this.setProperties(centerConfig.getOrDefault(ConfigureLevel.TASK, Map.empty[String, String])) // 配置中心紧急配置优先级最高,用于对所有任务生效的紧急参数调优 this.setProperties(centerConfig.getOrDefault(ConfigureLevel.URGENT, Map.empty[String, String])) this } /** * 自适应key的前缀 */ private[this] def adaptiveKey(key: String): String = { if (!key.startsWith(s"${this.engine}.")) s"${this.engine}.$key" else key } /** * 根据key获取配置信息 * 
注:其他均需要通过该API进行配置的获取,禁止直接调用:props.getProperty * * @param key * 配置的key * @return * 配置的value */ def getProperty(key: String): String = { if (this.isMerge.compareAndSet(false, true)) this.mergeEngineConf this.getOriginalProperty(this.adaptiveKey(key)) } /** * 获取原生的配置信息 */ private[fire] def getOriginalProperty(key: String): String = this.adaptiveSettingsMap.getOrElse(key, "") /** * 将给定的配置中的值与计量单位拆分开 * * @param value * 配置的值,形如:10.3min * @return * 拆分单位后的tuple,形如:(10.3, min) */ def splitUnit(value: String): (String, String) = { val numericPrefix = RegularUtils.numericPrefix.findFirstIn(value) val unitSuffix = RegularUtils.unitSuffix.findFirstIn(value) if (numericPrefix.isEmpty || unitSuffix.isEmpty) throw new IllegalArgumentException("配置中不包含数值或计量单位,请检查配置") (numericPrefix.get.trim, unitSuffix.get.trim) } /** * 获取字符串 */ def getString(key: String): String = this.getProperty(key) /** * 获取字符串,为空则取默认值 */ def getString(key: String, default: String): String = { val value = this.getProperty(key) if (StringUtils.isNotBlank(value)) value else default } /** * 获取拼接后数值的配置字符串 * * @param key 配置的前缀 * @param keyNum 拼接到key后的数值后缀 * @return * 对应的配置信息 */ def getString(key: String, default: String, keyNum: Int = 1): String = { if (keyNum <= 1) { var value = this.getProperty(key) if (StringUtils.isBlank(value)) { value = this.getString(key + "1", default) } value } else { this.getString(key + keyNum, default) } } /** * 获取拼接后数值的配置整数 * * @param key 配置的前缀 * @param keyNum 拼接到key后的数值后缀 * @return * 对应的配置信息 */ def getInt(key: String, default: Int, keyNum: Int = 1): Int = { val value = this.getString(key, default + "", keyNum) if (StringUtils.isNotBlank(value)) value.toInt else default } /** * 获取拼接后数值的配置长整数 * * @param key 配置的前缀 * @param keyNum 拼接到key后的数值后缀 * @return * 对应的配置信息 */ def getLong(key: String, default: Long, keyNum: Int = 1): Long = { this.get[Long](key, Some(default), keyNum) } /** * 获取float型数据 */ def getFloat(key: String, default: Float, keyNum: Int = 1): Float = { this.get[Float](key, Some(default), keyNum) } /** * 获取Double型数据 */ def getDouble(key: String, default: Double, keyNum: Int = 1): Double = { this.get[Double](key, Some(default), keyNum) } /** * 获取拼接后数值的配置布尔值 * * @param key 配置的前缀 * @param keyNum 拼接到key后的数值后缀 * @return * 对应的配置信息 */ def getBoolean(key: String, default: Boolean, keyNum: Int = 1): Boolean = { this.get[Boolean](key, Some(default), keyNum) } /** * 根据指定的key与key的num,获取对应的配置信息 * 1. 如果配置存在,则进行类型转换,返回T类型数据 * 2. 
如果配置不存在,则取default参数作为默认值返回 * * @param key * 配置的key * @param default * 如果配置不存在,则取default只 * @param keyNum * 配置key的后缀编号 * @tparam T * 返回配置的类型 * @return */ def get[T: ClassTag](key: String, default: Option[T] = Option.empty, keyNum: Int = 1): T = { val value = this.getString(key, if (default.isDefined) default.get.toString else "", keyNum = keyNum) val paramType = getParamType[T] val property = tryWithReturn { paramType match { case _ if paramType eq classOf[Int] => value.toInt case _ if paramType eq classOf[Long] => value.toLong case _ if paramType eq classOf[Float] => value.toFloat case _ if paramType eq classOf[Double] => value.toDouble case _ if paramType eq classOf[Boolean] => value.toBoolean case _ => value } }(this.logger, catchLog = s"为找到配置信息:${key},请检查!") property.asInstanceOf[T] } /** * 使用map设置多个值 * * @param map * java map,存放多个配置信息 */ def setProperties(map: mutable.Map[String, String]): Unit = this.synchronized { if (map != null) map.foreach(kv => this.setProperty(kv._1, kv._2)) } /** * 使用map设置多个值 * * @param map * java map,存放多个配置信息 */ def setProperties(map: JMap[String, Object]): Unit = this.synchronized { if (map != null) { map.foreach(kv => { if (StringUtils.isNotBlank(kv._1) && kv._2 != null) { this.setProperty(kv._1, kv._2.toString) } }) } } /** * 设置指定的配置 * 注:其他均需要通过该API进行配置的设定,禁止直接调用:props.setProperty * * @param key * 配置的key * @param value * 配置的value */ def setProperty(key: String, value: String): Unit = this.synchronized { if (StringUtils.isNotBlank(key) && StringUtils.isNotBlank(value)) { this.setAdaptiveProperty(this.adaptiveKey(key), value) this.originalSettingsMap.put(key, value) } } /** * 添加自适应前缀的配置信息 */ private[fire] def setAdaptiveProperty(key: String, value: String): Unit = this.synchronized(this.adaptiveSettingsMap.put(key, value)) /** * 添加纯粹的配置信息,不会被自动加上引擎前缀 */ private[fire] def setNormalProperty(key: String, value: String): Unit = this.synchronized(this.originalSettingsMap.put(key, value)) /** * 隐蔽密码信息后返回 */ def cover: Map[String, String] = this.adaptiveSettingsMap.filter(t => !t._1.contains("pass")) /** * 打印配置文件中的kv */ def show(): Unit = { if (!FireFrameworkConf.fireConfShow) return LogUtils.logStyle(this.logger, "Fire configuration.")(logger => { this.adaptiveSettingsMap.foreach(key => { // 如果包含配置黑名单,则不打印 if (key != null && !FireFrameworkConf.fireConfBlackList.exists(conf => key.toString.contains(conf))) { logger.info(s">>${FirePS1Conf.PINK} ${key._1} --> ${key._2} ${FirePS1Conf.DEFAULT}") } }) }) } /** * 获所有的配置信息(包含经过自适应处理的配置) * * @return * confMap */ def settings: Map[String, String] = { val map = Map[String, String]() map ++= this.originalSettingsMap map ++= this.adaptiveSettingsMap map } /** * 获取经过适配前缀的配置信息 * * @return * confMap */ def adaptiveSettings: Map[String, String] = { val map = Map[String, String]() map ++= this.adaptiveSettingsMap map } /** * 获取原始的配置信息 * * @return * confMap */ def originalSettings: Map[String, String] = { val map = Map[String, String]() map ++= this.originalSettingsMap map } /** * 指定key的前缀获取所有该前缀的key与value */ def sliceKeys(keyStart: String): immutable.Map[String, String] = { if (!this.cachedConfMap.contains(keyStart)) { val confMap = new mutable.HashMap[String, String]() this.adaptiveSettingsMap.foreach(key => { val adaptiveKeyStar = this.adaptiveKey(keyStart) if (key._1.contains(adaptiveKeyStar)) { val keySuffix = key._1.substring(adaptiveKeyStar.length) confMap.put(keySuffix, key._2) } }) this.cachedConfMap.put(keyStart, confMap.toMap) } this.cachedConfMap(keyStart) } /** * 根据keyNum选择对应的kafka配置 */ def sliceKeysByNum(keyStart: 
String, keyNum: Int = 1): collection.immutable.Map[String, String] = { // 用于匹配以指定keyNum结尾的key val reg = "\\D" + keyNum + "$" val map = new mutable.HashMap[String, String]() this.sliceKeys(keyStart).foreach(kv => { val keyLength = kv._1.length val keyNumStr = keyNum.toString // 末尾匹配keyNum并且keyNum的前一位非整数 val isMatch = reg.r.findFirstMatchIn(kv._1).isDefined // 提前key,如key=session.timeout.ms33,则提前后的key=session.timeout.ms val trimKey = if (isMatch) kv._1.substring(0, keyLength - keyNumStr.length) else kv._1 // 配置的key的末尾与keyNum匹配 if (isMatch) { map += (trimKey -> kv._2) } else if (keyNum <= 1) { // 匹配没有数字后缀的key,session.timeout.ms与session.timeout.ms1认为是同一个配置 val lastChar = kv._1.substring(keyLength - 1, keyLength) // 如果配置的结尾是字母 if (!StringsUtils.isInt(lastChar)) { map += (kv._1 -> kv._2) } } }) map.toMap } /** * 合并Conf中的配置信息 */ @Internal private[this] def mergeEngineConf: Unit = { val clazz = Class.forName(FireFrameworkConf.FIRE_ENGINE_CONF_HELPER) val method = clazz.getDeclaredMethod("syncEngineConf") val map = method.invoke(null).asInstanceOf[immutable.Map[String, String]] if (map.nonEmpty) { this.setProperties(map.filter(kv => !kv._1.contains(FireFrameworkConf.FIRE_REST_SERVER_SECRET))) logger.info(s"完成计算引擎配置信息的同步,总计:${map.size}条") map.foreach(k => logger.debug("合并:k=" + k._1 + " v=" + k._2)) } } /** * 获取指定类的配置注解信息(@FireConf优先级高于@Config注解) * * @param clazz * flink或spark任务的具体入口类 * @return * 配置文件名称 & 配置列表 */ @Internal private[this] def getAnnoConfig(clazz: Class[_]): Option[(Array[String], Array[(String, String)], String)] = { import com.zto.fire.common.anno.FireConf val annoConfig = ReflectionUtils.getClassAnnotation(clazz, classOf[Config]) val annoFireConfig = ReflectionUtils.getClassAnnotation(clazz, classOf[FireConf]) val fireArray, allProps = ArrayBuffer[String]() val confText = new mutable.StringBuilder() if (annoConfig != null) { val confAnno = annoConfig.asInstanceOf[Config] fireArray ++= confAnno.files() allProps ++= confAnno.props() confText.append(confAnno.value()) } if (annoFireConfig != null) { val fireConfAnno = annoFireConfig.asInstanceOf[FireConf] fireArray ++= fireConfAnno.files() allProps ++= fireConfAnno.props() confText.append(fireConfAnno.value()) } val files = fireArray.filter(StringUtils.isNotBlank).map(_.trim) // 获取通过@Config与@FireConf配置的所有参数 val props = allProps.filter(StringUtils.isNotBlank) .map(_.split("=", 2)) .filter(prop => noEmpty(prop) && prop.length == 2 && noEmpty(prop(0), prop(1))) .map(prop => { (prop(0).trim, prop(1).trim) }) Some(files.toArray, props.toArray, confText.toString()) } /** * 调用外部配置中心接口获取配合信息 */ @Internal private[this] def invokeConfigCenter(className: String): JMap[ConfigureLevel, JMap[String, String]] = { // 调用配置中心接口获取优先级最高的配置信息 ConfigurationCenterManager.invokeConfigCenter(className) } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/RegularUtils.scala ================================================ package com.zto.fire.common.util /** * 常用的正则表达式 * * @author ChengLong 2021-5-28 11:14:19 * @since fire 2.0.0 */ object RegularUtils { // 用于匹配纯数值的表达式 lazy val numeric = "(^[1-9]\\d*\\.?\\d*$)|(^0\\.\\d*[1-9]$)".r // 用于匹配字符串中以数值开头的数值 lazy val numericPrefix = "(^[1-9]\\d*\\.?\\d*)|(^0\\.\\d*[1-9])".r // 用于匹配字符串中以固定的字母+空白符结尾 lazy val unitSuffix = "[a-zA-Z]+\\s*$".r // 用于匹配使用#号作为注释的所有结尾 lazy val propAnnotation = "\\s+\\#.*".r // 用于匹配insert语句 lazy val insertReg = "^\\s*INSERT.*".r // 用于匹配sql中的with表达式的value lazy val withValueReg = """=\s*'.+'""".r // 用于匹配sql中with表达式value具体的值 lazy val valueReg = 
"""'.+'""".r // 用于匹配flink sql with表达式中数据源别名 lazy val withDatasourceReg = """'datasource'\s*=\s*'[A-Za-z0-9_]+'""".r // 多条sql语句的截取 lazy val sqlSplit = """;\s""" } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/SQLLineageManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util import com.zto.fire.common.bean.TableIdentifier import com.zto.fire.common.bean.lineage.{SQLTableColumns, _} import com.zto.fire.predef._ /** * SQL血缘解析管理器,协助快速构建SQL血缘信息 * * @author ChengLong 2022-09-01 15:10:38 * @since 2.2.3 */ private[fire] object SQLLineageManager { private lazy val statementSet = new JHashSet[String]() private lazy val relationSet = new JHashSet[SQLTableRelations]() private lazy val tableLineageMap = new JConcurrentHashMap[String, SQLTable]() /** * 添加待执行的SQL语句 */ def addStatement(statement: String): Unit = { if (noEmpty(statement)) this.statementSet.add(statement.trim) } /** * 维护表与表之间的关系 * * @param srcTable * 数据来源表 * @param sinkTable * 目标表 */ def addRelation(srcTableIdentifier: TableIdentifier, sinkTableIdentifier: TableIdentifier): Unit = { this.relationSet.add(new SQLTableRelations(srcTableIdentifier.toString, sinkTableIdentifier.toString)) } /** * 获取SQL血缘信息 */ def getSQLLineage: SQLLineage = { val sqlLineage = new SQLLineage() sqlLineage.setStatements(this.statementSet.toList) sqlLineage.setTables(this.tableLineageMap.values().toList) sqlLineage.setRelations(this.relationSet.toList) sqlLineage } /** * 根据给定的库表名称获取完整表名 * * @param dbName * 数据库名称(可为空) * @param tableName * 表名 * @return * dbName.tableName */ def getTableIdentify(tableIdentifier: TableIdentifier): String = { requireNonEmpty(tableIdentifier.table, "表名不能为空") tableIdentifier.toString } /** * 根据库表信息获取SQLTable实例 * * @param dbName * 数据库名称,可为空 * @param tableName * 表名 * @return * SQLTable */ def getTableInstance(tableIdentifier: TableIdentifier): SQLTable = { this.tableLineageMap.mergeGet(getTableIdentify(tableIdentifier)) { new SQLTable(tableIdentifier.toString) } } /** * 用于为指定的SQLTable对象添加必要的字段值 */ private[this] def setTableField(tableIdentifier: TableIdentifier)(fun: SQLTable => Unit): SQLTable = { val table = this.getTableInstance(tableIdentifier) fun(table) table } /** * 为指定的表添加options信息 * * @param options * 选项信息 */ def setOptions(tableIdentifier: TableIdentifier, options: Map[String, String]): SQLTable = { this.setTableField(tableIdentifier) { _.getOptions.putAll(options) } } /** * 为指定的表添加操作信息 * * @param operations * 操作类型信息(INSERT、DROP等) */ def setOperation(tableIdentifier: TableIdentifier, operations: String*): SQLTable = { this.setTableField(tableIdentifier) { _.getOperation.addAll(operations) } } /** * 为指定的表添加使用到的字段信息 * * @param columns * 字段列表 */ def 
setColumns(tableIdentifier: TableIdentifier, columns: Seq[(String, String)]): SQLTable = { this.setTableField(tableIdentifier) { _.getColumns.addAll(columns.map(t => new SQLTableColumns(t._1, t._2))) } } /** * 为指定的表添加使用到的分区信息 * * @param partitions * 分区列表 */ def setPartitions(tableIdentifier: TableIdentifier, partitions: Seq[(String, String)]): SQLTable = { this.setTableField(tableIdentifier) { _.getPartitions.addAll(partitions.map(t => new SQLTablePartitions(t._1, t._2))) } } /** * 为指定的表添加catalog信息 * * @param catalog * catalog信息:hive、kafka、jdbc等 */ def setCatalog(tableIdentifier: TableIdentifier, catalog: String): SQLTable = { this.setTableField(tableIdentifier) { _.setCatalog(catalog) } } /** * 为指定的表添加comment信息 * * @param comment * 表注释信息 */ def setComment(tableIdentifier: TableIdentifier, comment: String): SQLTable = { this.setTableField(tableIdentifier) { _.setComment(comment) } } /** * 为指定的表添加catalog的集群url * * @param cluster * 集群地址 */ def setCluster(tableIdentifier: TableIdentifier, cluster: String): SQLTable = { this.setTableField(tableIdentifier) { _.setCluster(cluster) } } /** * 为指定的表添加catalog的具体物理表名 * * @param physicalTable * 真实的表名 */ def setPhysicalTable(tableIdentifier: TableIdentifier, physicalTable: String): SQLTable = { this.setTableField(tableIdentifier) { _.setPhysicalTable(physicalTable) } } /** * 为指定的表添加视图名称 * * @param tmpView * spark或flink任务内部注册的临时表名 */ def setTmpView(tableIdentifier: TableIdentifier, tmpView: String): SQLTable = { this.setTableField(tableIdentifier) { _.setTmpView(tmpView) } } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/SQLUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.util import com.zto.fire.predef._ import org.apache.commons.lang3.StringUtils import scala.collection.mutable.ListBuffer /** * SQL相关工具类 * * @author ChengLong * @since 1.1.2 * @create 2020-11-26 15:09 */ object SQLUtils extends Logging { private[this] val beforeWorld = "(?i)(from|join|update|into table|table|into|exists|desc|like|if)" private[this] val reg = s"${beforeWorld}\\s+(\\w+\\.\\w+|\\w+)".r /** * 利用正则表达式解析SQL中用到的表名 */ def tableParse(sql: String): ListBuffer[String] = { require(StringUtils.isNotBlank(sql), "sql语句不能为空") val tables = ListBuffer[String]() // 找出所有beforeWorld中定义的关键字匹配到的后面的表名 reg.findAllMatchIn(sql.replace("""`""", "")).foreach(tableName => { // 将匹配到的数据剔除掉beforeWorld中定义的关键字 val name = tableName.toString().replaceAll(s"${beforeWorld}\\s+", "").trim if (StringUtils.isNotBlank(name)) tables += name }) tables } /** * 执行多条sql语句,以分号分割 */ def executeSql[T](sql: String)(block: String => T): Option[T] = { require(StringUtils.isNotBlank(sql), "待执行的sql语句不能为空") var result: Option[T] = None sql.split(RegularUtils.sqlSplit).filter(noEmpty(_)).foreach(statement => { if (noEmpty(statement)) { logger.debug("当前执行sql:\n" + statement) result = Some(block(statement)) } }) result } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/ScalaUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util import scala.reflect.{ClassTag, classTag} import scala.runtime.Nothing$ /** * scala工具类 * * @author ChengLong * @since 2.0.0 * @create 2021-01-04 14:06 */ trait ScalaUtils { /** * 获取泛型具体的类型 * * @tparam T * 泛型类型 * @return * Class[T] */ def getParamType[T: ClassTag]: Class[T] = { val paramType = classTag[T].runtimeClass.asInstanceOf[Class[T]] if (paramType == classOf[Nothing$]) throw new IllegalArgumentException("不合法的方法调用,请在方法调用时指定泛型!") paramType } /** * 用于判断给定的类是否为object * @return * true:对象或半生对象 false:class */ def isObject(clazz: Class[_]): Boolean = clazz.getName.endsWith("$") } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/ShutdownHookManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util import com.zto.fire.predef._ import java.util.PriorityQueue import java.util.concurrent.atomic.AtomicBoolean /** * Fire框架统一的shutdown hook管理器,所有注册了的hook将会在jvm退出前根据优先级依次调用 * * @author ChengLong * @create 2020-11-20 14:06 * @since 1.1.2 */ private[fire] class ShutdownHookManager extends Logging { // 具有优先级的队列,存放各处注册的hook信息,在jvm退出前根据优先级依次调用 private[this] val hooks = new PriorityQueue[HookEntry]() private[this] val shuttingDown = new AtomicBoolean(false) /** * 执行所有的hook */ def runAll: Unit = { if (this.shuttingDown.compareAndSet(false, true)) { var nextHook: HookEntry = null while ( { nextHook = hooks.synchronized { hooks.poll() }; nextHook != null }) { // 调用每一个hook的run方法 tryWithLog(nextHook.run())(this.logger, tryLog = "Fire shutdown hook executed.", catchLog = "执行hook过程中发生例外.") } } } /** * install所有的hook */ def install: Unit = { Runtime.getRuntime.addShutdownHook(new Thread() { // 调用hooks中的所有hook的run方法,每个run都会被try/cache包围 override def run(): Unit = runAll }) } /** * 添加指定优先级的hook */ def add(priority: Int, hook: () => Unit): Unit = { this.hooks.synchronized { if (this.shuttingDown.get()) throw new IllegalStateException("Shutdown hooks 在关闭过程中无法注册新的hook") this.hooks.add(new HookEntry(priority, hook)) } } /** * 移除指定的hook */ def remove(ref: AnyRef): Unit = { this.hooks.synchronized { this.hooks.remove(ref) } } } /** * hook项,包含优先级与具体的hook逻辑 * * @param priority * hook优先级,优先级高的会先被调用 * @param hook * hook具体的执行逻辑,比如用于关闭数据库连接等 */ private[fire] class HookEntry(private val priority: Int, hook: () => Unit) extends Comparable[HookEntry] { /** * hook执行顺序的优先级比较 */ override def compareTo(o: HookEntry): Int = o.priority - this.priority /** * run方法中调用hook函数 */ def run(): Unit = hook() } /** * Fire框架统一的shutdown hook管理器 * 调用者可以基于提供的api进行hook的注册 */ object ShutdownHookManager { // 优先级定义 lazy val DEFAULT_PRIORITY = 10 private[fire] lazy val HEIGHT_PRIORITY = 100 val LOW_PRIORITY = 5 private[this] lazy val hookManager = new ShutdownHookManager() this.hookManager.install def addShutdownHook(priority: Int = DEFAULT_PRIORITY)(hook: () => Unit): Unit = { hookManager.add(priority, hook) } def removeShutdownHook(ref: AnyRef): Unit = this.hookManager.remove(ref) } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/ThreadUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util import com.zto.fire.common.conf.{FireFrameworkConf, FirePS1Conf} import com.zto.fire.common.enu.ThreadPoolType import com.zto.fire.predef._ import org.apache.commons.lang3.StringUtils import java.util.concurrent._ /** * 线程相关工具类 * * @author ChengLong 2019-4-25 15:17:55 */ object ThreadUtils extends Logging { // 用于维护使用ThreadUtils创建的线程池对象,并进行统一的关闭 private lazy val poolMap = new JConcurrentHashMap[String, ExecutorService]() private lazy val singlePool = this.createThreadPool("FireSinglePool", ThreadPoolType.SINGLE) private lazy val cachedPool = this.createThreadPool("FireCachedPool", ThreadPoolType.CACHED) private lazy val scheduledPool = this.createThreadPool("FireScheduledPool", ThreadPoolType.SCHEDULED, FireFrameworkConf.threadPoolSchedulerSize).asInstanceOf[ScheduledExecutorService] /** * 利用SingleThreadExecutor执行给定的函数 * * @param fun * 用于指定以多线程方式执行的函数 */ def runAsSingle(fun: => Unit): Unit = { this.singlePool.execute(new Runnable { override def run(): Unit = fun }) } /** * 利用CachedThreadPool执行给定的函数 * * @param fun * 用于指定以多线程方式执行的函数 */ def run(fun: => Unit): Unit = { this.cachedPool.execute(new Runnable { override def run(): Unit = fun logger.debug(s"Invoke runAsThread as ${Thread.currentThread().getName}.") }) } /** * 利用CachedThreadPool循环执行给定的函数 * * @param fun * 用于指定以多线程方式执行的函数 * @param delay * 循环调用间隔时间(单位s) */ def runLoop(fun: => Unit, delay: Long = 10): Unit = { this.cachedPool.execute(new Runnable { override def run(): Unit = { while (true) { fun logger.debug(s"Loop invoke runAsThreadLoop as ${Thread.currentThread().getName}. Delay is ${delay}s.") Thread.sleep(delay * 1000) } } }) } /** * 利用ScheduledThreadPool定时调度执行给定的函数 * * @param fun * 定时执行的任务函数引用 * @param initialDelay * 第一次延迟执行的时长 * @param period * 每隔指定的时长执行一次 * @param rate * true:表示周期性的执行,不受上一个定时任务的约束 * false:表示当上一次周期性任务执行成功后,period后开始执行 * @param timeUnit * 时间单位,默认分钟 */ def schedule(fun: => Unit, initialDelay: Long, period: Long, rate: Boolean = true, timeUnit: TimeUnit = TimeUnit.MINUTES): Unit = { if (rate) { // 表示周期性的执行,不受上一个定时任务的约束 this.scheduledPool.scheduleAtFixedRate(new Runnable { override def run(): Unit = wrapFun() }, initialDelay, period, timeUnit) } else { // 表示当上一次周期性任务执行成功后,period后开始执行 this.scheduledPool.scheduleWithFixedDelay(new Runnable { override def run(): Unit = wrapFun() }, initialDelay, period, timeUnit) } // 处理传入的函数 def wrapFun(): Unit = { fun this.logger.debug(s"Loop invoke runAsSchedule as ${Thread.currentThread().getName}. 
Delay is ${period}${timeUnit.name()}.") } } /** * 表示当上一次周期性任务执行成功后 * period后开始执行给定的函数fun * * @param fun * 定时执行的任务函数引用 * @param initialDelay * 第一次延迟执行的时长 * @param period * 每隔指定的时长执行一次 * @param timeUnit * 时间单位,默认分钟 */ def scheduleAtFixedRate(fun: => Unit, initialDelay: Long, period: Long, timeUnit: TimeUnit = TimeUnit.MINUTES): Unit = { this.schedule(fun, initialDelay, period, true, timeUnit) } /** * 表示当上一次周期性任务执行成功后,period后开始执行fun函数 * 注:受上一个定时任务的影响 * * @param fun * 定时执行的任务函数引用 * @param initialDelay * 第一次延迟执行的时长 * @param period * 每隔指定的时长执行一次 * @param timeUnit * 时间单位,默认分钟 */ def scheduleWithFixedDelay(fun: => Unit, initialDelay: Long, period: Long, timeUnit: TimeUnit = TimeUnit.MINUTES): Unit = { this.schedule(fun, initialDelay, period, false, timeUnit) } /** * 创建一个新的指定大小的调度线程池 * 如果名称已存在,则直接返回对应的线程池 * * @param poolName * 线程池标识 * @param poolType * 线程池类型 * @param poolSize * 线程池大小 */ def createThreadPool(poolName: String, poolType: ThreadPoolType = ThreadPoolType.FIXED, poolSize: Int = 1): ExecutorService = { require(StringUtils.isNotBlank(poolName), "线程池名称不能为空") if (this.poolMap.containsKey(poolName)) { this.poolMap.get(poolName) } else { val threadPool = poolType match { case ThreadPoolType.FIXED => Executors.newFixedThreadPool(poolSize) case ThreadPoolType.SCHEDULED => Executors.newScheduledThreadPool(poolSize) case ThreadPoolType.SINGLE => Executors.newSingleThreadExecutor() case ThreadPoolType.CACHED => Executors.newCachedThreadPool() case ThreadPoolType.WORK_STEALING => Executors.newWorkStealingPool() case _ => Executors.newFixedThreadPool(poolSize) } this.poolMap.put(poolName, threadPool) threadPool } } /** * 用于释放指定的线程池 * * @param poolName * 线程池标识 */ def shutdown(poolName: String): Unit = { if (StringUtils.isNotBlank(poolName) && this.poolMap.containsKey(poolName)) { val threadPool = this.poolMap.get(poolName) if (threadPool != null && !threadPool.isShutdown) { threadPool.shutdownNow() this.logger.debug(s"关闭线程池:${poolName}") } } } /** * 用于释放指定的线程池 */ def shutdown(pool: ExecutorService): Unit = { if (pool != null && !pool.isShutdown) { pool.shutdown() this.logger.debug(s"关闭线程池:${pool}") } } /** * 用于释放所有线程池 */ private[fire] def shutdown: Unit = { val poolNum = this.poolMap.size() if (this.poolMap.size() > 0) { this.poolMap.foreach(pool => { if (pool != null && pool._2 != null && !pool._2.isShutdown) { pool._2.shutdownNow() this.logger.info(s"${FirePS1Conf.GREEN}---> 完成线程池[ ${pool._1} ]的资源回收. <---${FirePS1Conf.DEFAULT}") } }) } this.logger.info(s"${FirePS1Conf.PINK}---> 完成所有线程池回收,总计:${poolNum}个. <---${FirePS1Conf.DEFAULT}") } } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/Tools.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.util import com.zto.fire.common.ext.{JavaExt, ScalaExt} import scala.collection.convert.{WrapAsJava, WrapAsScala} import scala.util.control.Breaks /** * 各种工具API的集合类 * * @author ChengLong * @since 1.0.0 * @create 2020-12-16 16:23 */ trait Tools extends Breaks with JavaTypeMap with ValueCheck with FireFunctions with JavaExt with ScalaExt with ScalaUtils with WrapAsScala with WrapAsJava { } ================================================ FILE: fire-common/src/main/scala/com/zto/fire/common/util/ValueUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util import java.util import com.zto.fire.predef._ import org.apache.commons.lang3.StringUtils /** * 值校验工具,支持任意对象、字符串、集合、map、rdd、dataset是否为空的校验 * * @since 0.4.1 * @author ChengLong 2019-9-4 13:39:16 */ private[fire] trait ValueCheck { /** * 值为空判断,支持任意类型 * * @param params * 参数值 * @return * true:empty false:not empty */ def isEmpty(params: Any *): Boolean = { if (params == null || params.isEmpty) return true params.map { case null => true case str: String => StringUtils.isBlank(str) case array: Array[_] => array.isEmpty case collection: util.Collection[_] => collection.isEmpty case it: Iterable[_] => it.isEmpty case map: JMap[_, _] => map.isEmpty case _ => false }.count(_ == true) > 0 } /** * 值为非空判断,支持任意类型 * * @param param * 参数值 * @return * true:not empty false:empty */ def noEmpty(param: Any *): Boolean = !this.isEmpty(param: _*) /** * 参数非空约束,仅检查是否存在null引用 * * @param params 参数列表信息 * @param message 异常信息 */ def requireNonNull(params: Any*)(implicit message: String = "参数不能为空,请检查."): Unit = { require(params != null && params.nonEmpty, message) var index = 0 params.foreach(param => { index += 1 param match { case null => require(param != null, msg(index, message)) case _ => } }) /** * 构建异常信息 */ def msg(index: Int, msg: String): String = s"第[ ${index} ]参数为null,异常信息:$message" } /** * 参数非空约束(严格模式,进一步验证集合是否有元素) * * @param params 参数列表信息 * @param message 异常信息 */ def requireNonEmpty(params: Any*)(implicit message: String = "参数不能为空,请检查."): Unit = { require(params != null && params.nonEmpty, message) var index = 0 params.foreach(param => { index += 1 param match { case null => require(param != null, msg(index, message)) case str: String => require(StringUtils.isNotBlank(str), msg(index, message)) case array: Array[_] => require(array.nonEmpty, msg(index, message)) case collection: util.Collection[_] => require(!collection.isEmpty, msg(index, message)) case it: Iterable[_] => require(it.nonEmpty, msg(index, message)) case map: JMap[_, _] => require(map.nonEmpty, msg(index, message)) case _ => } }) /** * 构建异常信息 */ def msg(index: Int, msg: String): String = s"第[ ${index} ]参数为空,异常信息:$message" } } /** * 用于单独调用的值校验工具类 */ object 
ValueUtils extends ValueCheck ================================================ FILE: fire-common/src/test/scala/com/zto/fire/common/util/RegularUtilsUnitTest.scala ================================================ package com.zto.fire.common.util import com.zto.fire.common.anno.TestStep import org.junit.Test import java.io.StringReader import java.util.Properties /** * 常用的正则表达式 * * @author ChengLong 2022-05-12 17:20:55 * @since fire 2.2.2 */ class RegularUtilsUnitTest { @Test @TestStep(step = 1, desc = "@Config注解中的注释解析单元测试") def testPropAnnotation: Unit = { val conf = """ |# hello world | # hello world | # 注释 | # 注释 |#fire framework |# fire framework | #fire framework | hive.cluster=batch |kafka.brokers1 = test#$kafka |kafka.brokers2=test # 注释kafka |#kafka.brokers3=test # kafka |""".stripMargin val normalValue = RegularUtils.propAnnotation.replaceAllIn(conf, "").replaceAll("\\|", "").trim println(normalValue) val valueProps = new Properties() val stringReader = new StringReader(normalValue) valueProps.load(stringReader) stringReader.close() assert(valueProps.size() == 3) assert(valueProps.getProperty("kafka.brokers1").equals("test#$kafka")) assert(valueProps.getProperty("kafka.brokers2").equals("test")) } @Test @TestStep(step = 2, desc = "用于测试insert的sql语句") def testInsetReg: Unit = { val sql1 = "insert into" val sql2 = "INSERT into" val sql3 = " insert asf " val sql4 = """insert into""" val sql5 = """ |insert into |""".stripMargin val sql6 = """ | insert into |""".stripMargin val sql7 = """ | | insert into |""".stripMargin val sqls = Seq(sql1, sql2, sql3, sql4, sql5, sql6, sql7) // 用于匹配使用#号作为注释的所有结尾 sqls.foreach(sql => { assert(RegularUtils.insertReg.findFirstIn(sql.toUpperCase).isDefined) }) val sql8 = """ |create xxx -- insert |""".stripMargin assert(RegularUtils.insertReg.findFirstIn(sql8.toUpperCase).isEmpty) val sql9 = """ |c insert |""".stripMargin assert(RegularUtils.insertReg.findFirstIn(sql9.toUpperCase).isEmpty) } } ================================================ FILE: fire-common/src/test/scala/com/zto/fire/common/util/SQLUtilsTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.common.util import org.junit.Test import com.zto.fire.common.util.SQLUtils._ /** * SQLUtils单元测试 * * @author ChengLong * @since 1.0.0 * @create 2020-11-26 15:11 */ class SQLUtilsTest { @Test def testParse: Unit = { val selectSql = """ | select * FROM | student1 s join dev.teacher2 b |""".stripMargin tableParse(selectSql).foreach(tableName => println("匹配:" + tableName)) val insertSQL = """ |insert into dev.student3(id,name) values(1, 'root'); |insert into teacher4(id,name) values(1, 'root'); |""".stripMargin tableParse(insertSQL).foreach(tableName => println("匹配:" + tableName)) val deleteSQL = """ |delete from teacher5 where id=10; |delete from dev.teacher6 where id=10; |""".stripMargin tableParse(deleteSQL).foreach(tableName => println("匹配:" + tableName)) val createSQL = """ |create table hello7(idxxx); |create table if not EXISTS hello8; |CREATE TABLE student9 LIKE tmp.student10 |""".stripMargin tableParse(createSQL).foreach(tableName => println("匹配:" + tableName)) val alterSQL = """ |LOAD DATA LOCAL INPATH '/home/hadoop/data/student1.txt' INTO TABLE student11 |""".stripMargin tableParse(alterSQL).foreach(tableName => println("匹配:" + tableName)) val testSQL = """ |create table table_student12 |insert into dev.student13_from |delete from `from_student14_from` |select * from (select * from student15) |select * from (select * from |student16) |""".stripMargin tableParse(testSQL).foreach(tableName => println("匹配:" + tableName)) val start = System.currentTimeMillis() (1 to 1000).foreach(i => tableParse(selectSql)) println("耗时:" + (System.currentTimeMillis() - start)) } } ================================================ FILE: fire-common/src/test/scala/com/zto/fire/common/util/ShutdownHookManagerTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util import org.apache.log4j.{Level, Logger} import org.junit.Test /** * shutdown hook管理器单元测试 * * @author ChengLong * @since 1.1.2 * @create 2020-11-20 14:45 */ class ShutdownHookManagerTest { Logger.getLogger(classOf[ShutdownHookManagerTest]).setLevel(Level.toLevel("INFO")) @Test def testRegister: Unit = { ShutdownHookManager.addShutdownHook(1) { () => println("1. 执行逻辑") } ShutdownHookManager.addShutdownHook(3) { () => println("3. 执行逻辑") } ShutdownHookManager.addShutdownHook(2) { () => println("2. 执行逻辑") } ShutdownHookManager.addShutdownHook(5) { () => println("5. 执行逻辑") } println("=========main method==========") } } ================================================ FILE: fire-common/src/test/scala/com/zto/fire/common/util/ValueUtilsTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.common.util import com.zto.fire.predef._ import org.junit.Test /** * ValueUtils工具类单元测试 * * @author ChengLong * @since 1.0.0 * @create 2020-12-16 13:21 */ class ValueUtilsTest { /** * 测试isEmpty、isNotEmpty等API */ @Test def testIsEmpty(): Unit = { val str = "" assert(isEmpty(str), "字符串不能为空") val map = new JHashMap[String, Integer]() assert(isEmpty(str, map), "存在为空的值") map.put("1", 1) assert(noEmpty("123", map), "都不为空") assert(!noEmpty("123", map, ""), "存在为空的") } /** * 测试参数检测API */ @Test def testRequireNonEmpty(): Unit = { val arr = new Array[Int](1) val map = Map("str" -> 1) val mutableMap = scala.collection.mutable.Map("str" -> 1) val jmap = new JHashMap[String, Integer]() jmap.put("str", 1) val jset = new JHashSet[Int]() jset.add(1) requireNonEmpty(arr, map, mutableMap, jmap, jset)("参数不合法") } /** * 测试参数检测API */ @Test def testRequireNonNull(): Unit = { val arr = new Array[Int](1) val map = Map("str" -> 1) val mutableMap = scala.collection.mutable.Map("str" -> 1) val jmap = new JHashMap[String, Integer]() jmap.put("str", 1) val jset = new JHashSet[Int]() jset.add(1) requireNonNull(arr, map, mutableMap, jmap, jset)("参数不合法") } } ================================================ FILE: fire-connectors/.gitignore ================================================ # use glob syntax. syntax: glob *.ser *.class *~ *.bak #*.off *.old *.lck *.txt # eclipse conf file .settings .classpath .project .manager .scala_dependencies # idea .idea *.iml # building target build null tmp* temp* dist test-output build.log # other scm .svn .CVS .hg* # switch to regexp syntax. # syntax: regexp # ^\.pc/ #SHITTY output not in target directory build.log ================================================ FILE: fire-connectors/base-connectors/fire-hbase/pom.xml ================================================ 4.0.0 fire-connector-hbase_${scala.binary.version} jar Fire : Connectors : Common : HBase com.zto.fire fire-connectors-common 2.3.2-SNAPSHOT ../pom.xml org.apache.hadoop hadoop-common ${hadoop.version} ${maven.scope} org.apache.hadoop hadoop-hdfs ${hadoop.version} ${maven.scope} org.apache.hadoop hadoop-client ${hadoop.version} ${maven.scope} org.apache.hbase hbase-common ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-server ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-client ${hbase.version} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-connectors/base-connectors/fire-hbase/src/main/java/com/zto/fire/hbase/anno/HConfig.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.hbase.anno; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * HBase相关的配置 * @author ChengLong 2020-11-16 16:03:08 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface HConfig { /** * 是否允许空字段插入HBase */ boolean nullable() default true; /** * 是否以多版本方式插入 * 注:fire中将数据转为json后以多版本方式插入,因此多列数据最终存放到HBase中只是一列json数据 */ boolean multiVersion() default false; /** * 默认获取的版本数 */ int versions() default 1; } ================================================ FILE: fire-connectors/base-connectors/fire-hbase/src/main/scala/com/zto/fire/hbase/HBaseConnector.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.hbase import com.google.common.collect.Maps import com.zto.fire.common.anno.{FieldName, Internal} import com.zto.fire.common.enu.{Operation => FOperation} import com.zto.fire.common.util._ import com.zto.fire.core.connector.{ConnectorFactory, FireConnector} import com.zto.fire.hbase.anno.HConfig import com.zto.fire.hbase.bean.{HBaseBaseBean, MultiVersionsBean} import com.zto.fire.hbase.conf.FireHBaseConf import com.zto.fire.hbase.conf.FireHBaseConf._ import com.zto.fire.predef._ import org.apache.commons.lang3.StringUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hbase._ import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.io.compress.Compression import org.apache.hadoop.hbase.util.Bytes import java.lang.reflect.Field import java.lang.{Boolean => JBoolean, Double => JDouble, Float => JFloat, Integer => JInt, Long => JLong, Short => JShort, String => JString} import java.math.{BigDecimal => JBigDecimal} import java.nio.charset.StandardCharsets import java.util.concurrent.{ScheduledExecutorService, TimeUnit, ConcurrentHashMap => JConcurrentHashMap} import java.util.{Map => JMap} import scala.collection.Iterator import scala.collection.mutable.ListBuffer import scala.reflect.{ClassTag, classTag} /** * HBase操作工具类,除了涵盖CRUD等常用操作外,还提供以下功能: * 1. static void insert(String tableName, String family, List list) * 将自定义的javabean集合批量插入到表中 * 2. scan[T <: HBaseBaseBean[T]](tableName: String, scan: Scan, clazz: Class[T], keyNum: Int = 1): ListBuffer[T] * 指定查询条件,将查询结果以List[T]形式返回 * 注:自定义bean中的field需与hbase中的qualifier对应 *

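 * A minimal usage sketch (illustrative only; `Student` is assumed to be a user-defined
 * subclass of HBaseBaseBean[Student], and the table name and bean instances are placeholders):
 * {{{
 *   // obtain the connector instance bound to keyNum = 1 via the companion factory
 *   val hbase = HBaseConnector(keyNum = 1)
 *   // write beans and read them back by rowKey
 *   hbase.insert[Student]("t_student", stu1, stu2)
 *   val beans = hbase.get[Student]("t_student", classOf[Student], "rowKey1", "rowKey2")
 * }}}
 *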
* * @param conf * 代码级别的配置信息,允许为空,配置文件会覆盖相同配置项,也就是说配置文件拥有着跟高的优先级 * @param keyNum * 用于区分连接不同的数据源,不同配置源对应不同的Connector实例 * @since 2.0.0 * @author ChengLong 2020-11-11 */ class HBaseConnector(val conf: Configuration = null, val keyNum: Int = 1) extends FireConnector(keyNum = keyNum) { // --------------------------------------- 反射缓存 --------------------------------------- // private[this] var configuration: Configuration = _ private[this] lazy val cacheFieldMap = new JConcurrentHashMap[Class[_], JMap[String, Field]]() private[this] lazy val cacheHConfigMap = new JConcurrentHashMap[Class[_], HConfig]() private[this] lazy val cacheTableExistsMap = new JConcurrentHashMap[String, Boolean]() private[this] lazy val connection: Connection = this.initConnection private[this] lazy val durability = this.initDurability // ------------------------------------ 表存在判断缓存 ------------------------------------ // private[this] lazy val tableExistsCacheEnable = tableExistsCache(this.keyNum) private[this] lazy val closeAdminError = "close admin执行失败" this.registerReload /** * 批量插入多行多列,自动将HBaseBaseBean子类转为Put集合 * * @param tableName 表名 * @param beans HBaseBaseBean子类集合 */ def insert[T <: HBaseBaseBean[T] : ClassTag](tableName: String, beans: T*): Unit = { requireNonNull(tableName, beans)("参数不合法,批量HBase insert失败") var table: Table = null tryFinallyWithReturn { table = this.getTable(tableName) val beanList = if (this.getMultiVersion[T]) beans.filter(_ != null).map((bean: T) => new MultiVersionsBean(bean)) else beans val putList = beanList.map(bean => convert2Put(bean.asInstanceOf[T], this.getNullable[T])) this.insert(tableName, putList: _*) } { this.closeTable(table) }(this.logger, catchLog = s"HBase insert ${hbaseClusterUrl(keyNum)}.${tableName}执行失败, 总计${beans.size}条", finallyCatchLog = "close HBase table失败") } /** * 批量插入多行多列 * * @param tableName 表名 * @param puts Put集合 */ def insert(tableName: String, puts: Put*): Unit = { requireNonNull(tableName, puts)("参数不合法,批量HBase insert失败") var table: Table = null tryFinallyWithReturn { table = this.getTable(tableName) table.put(puts) LineageManager.addDBDatasource("HBase", hbaseClusterUrl(keyNum), tableName, operation = FOperation.INSERT) this.logger.info(s"HBase insert ${hbaseClusterUrl(keyNum)}.${tableName}执行成功, 总计${puts.size}条") } { this.closeTable(table) }(this.logger, "HBase insert", s"HBase insert ${hbaseClusterUrl(keyNum)}.${tableName}执行失败, 总计${puts.size}条", "close HBase table失败") } /** * 从HBase批量Get数据,并将结果封装到JavaBean中 * * @param tableName 表名 * @param rowKeys 指定的多个rowKey * @param clazz 目标类类型,必须是HBaseBaseBean的子类 * @return 目标对象实例 */ def get[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], rowKeys: String*): ListBuffer[T] = { val getList = for (rowKey <- rowKeys) yield HBaseConnector.buildGet(rowKey) this.get[T](tableName, clazz, getList: _*) } /** * 从HBase批量Get数据,并将结果封装到JavaBean中 * * @param tableName 表名 * @param clazz 目标类类型,必须是HBaseBaseBean的子类 * @param gets 指定的多个get对象 * @return 目标对象实例 */ def get[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], gets: Get*)(implicit canOverload: Boolean = true): ListBuffer[T] = { requireNonNull(tableName, clazz, gets)("参数不合法,无法进行HBase Get操作") tryWithReturn { this.getMaxVersions[T](gets: _*) val resultList = this.getResult(tableName, gets: _*) if (this.getMultiVersion[T]) this.hbaseMultiRow2Bean[T](resultList, clazz) else this.hbaseRow2Bean(resultList, clazz) }(this.logger, catchLog = s"批量 get ${hbaseClusterUrl(keyNum)}.${tableName}执行失败") } /** * 通过HBase Seq[Get]获取多条数据 * * @param tableName 表名 * @param 
getList HBase的get对象实例 * @return * HBase Result */ def getResult(tableName: String, getList: Get*): ListBuffer[Result] = { requireNonNull(tableName, getList)("参数不合法,执行HBase 批量get失败") var table: Table = null val list = ListBuffer[Result]() tryFinallyWithReturn { LineageManager.addDBDatasource("HBase", hbaseClusterUrl(keyNum), tableName, operation = FOperation.GET) table = this.getTable(tableName) list ++= table.get(getList) this.logger.info(s"HBase 批量get ${hbaseClusterUrl(keyNum)}.${tableName}执行成功, 总计${list.size}条") list } { this.closeTable(table) }(this.logger, "HBase get", s"get ${hbaseClusterUrl(keyNum)}.${tableName}执行失败", "close HBase table对象失败.") } /** * 通过HBase Get对象获取一条数据 * * @param tableName 表名 * @return * HBase Result */ def getResult[T: ClassTag](tableName: String, rowKeyList: String*): ListBuffer[Result] = { requireNonNull(tableName, rowKeyList)("参数不合法,rowKey集合不能为空.") val getList = for (rowKey <- rowKeyList) yield HBaseConnector.buildGet(rowKey) val starTime = currentTime val resultList = this.getResult(tableName, getList: _*) logger.info(s"HBase 批量get ${hbaseClusterUrl(keyNum)}.${tableName}执行成功, 总计${resultList.size}条, 耗时:${elapsed(starTime)}") resultList } /** * 表扫描,将scan后得到的ResultScanner对象直接返回 * 注:调用者需手动关闭ResultScanner对象实例 * * @param tableName 表名 * @param scan HBase scan对象 * @return 指定类型的List */ def scanResultScanner(tableName: String, scan: Scan): ResultScanner = { requireNonEmpty(tableName, scan)(s"参数不合法,scan ${hbaseClusterUrl(keyNum)}.${tableName}失败.") var table: Table = null var rsScanner: ResultScanner = null try { table = this.getTable(tableName) LineageManager.addDBDatasource("HBase", hbaseClusterUrl(keyNum), tableName, operation = FOperation.SCAN) rsScanner = table.getScanner(scan) } catch { case e: Exception => { // 当执行scan失败时,向上抛异常之前,避免ResultScanner对象因异常无法得到有效的关闭 // 因此在发生异常时会尝试关闭ResultScanner对象 logger.error(s"执行scan ${hbaseClusterUrl(keyNum)}.${tableName}失败", e) try { this.closeResultScanner(rsScanner) } finally { throw e } } } finally { this.closeTable(table) } rsScanner } /** * 表扫描,将scan后得到的ResultScanner对象直接返回 * 注:调用者需手动关闭ResultScanner对象实例 * * @param tableName 表名 * @param startRow 开始行 * @param endRow 结束行 * @return 指定类型的List */ def scanResultScanner(tableName: String, startRow: String, endRow: String): ResultScanner = { requireNonEmpty(tableName, startRow, endRow) val scan = HBaseConnector.buildScan(startRow, endRow) this.scanResultScanner(tableName, scan) } /** * 表扫描,将查询后的数据转为JavaBean并放到List中 * * @param tableName 表名 * @param startRow 开始行 * @param endRow 结束行 * @param clazz 类型 * @return 指定类型的List */ def scan[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, endRow: String): ListBuffer[T] = { requireNonEmpty(tableName, clazz, startRow, endRow) val scan = HBaseConnector.buildScan(startRow, endRow) this.scan[T](tableName, clazz, scan) } /** * 表扫描,将查询后的数据转为JavaBean并放到List中 * * @param tableName 表名 * @param scan HBase scan对象 * @param clazz 类型 * @return 指定类型的List */ def scan[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan): ListBuffer[T] = { requireNonEmpty(tableName, clazz, scan)(s"参数不合法,scan ${hbaseClusterUrl(keyNum)}.${tableName}失败.") val list = ListBuffer[T]() var rsScanner: ResultScanner = null tryFinallyWithReturn { this.setScanMaxVersions[T](scan) rsScanner = this.scanResultScanner(tableName, scan) if (rsScanner != null) { rsScanner.foreach(rs => { if (this.getMultiVersion[T]) { val objList = this.hbaseMultiRow2Bean[T](rs, clazz) if (objList != null && objList.nonEmpty) list ++= objList } else { val 
obj = hbaseRow2Bean(rs, clazz) if (obj.isDefined) list += obj.get } }) } this.logger.info(s"HBase scan ${hbaseClusterUrl(keyNum)}.${tableName}执行成功, 总计${list.size}条") list } { this.closeResultScanner(rsScanner) }(this.logger, "HBase scan", s"scan ${hbaseClusterUrl(keyNum)}.${tableName}执行失败", "关闭HBase table对象或ResultScanner失败") } /** * 获取Configuration实例 * * @return HBase Configuration对象 */ def getConfiguration: Configuration = this.configuration /** * 用于初始化全局唯一的HBase connection */ @Internal def initConnection: Connection = { tryWithReturn { ConnectionFactory.createConnection(this.getConfiguration) }(logger, s"成功创建HBase ${hbaseClusterUrl(keyNum)}集群connection.", s"获取HBase ${hbaseClusterUrl(keyNum)}集群connection失败.") } /** * 根据keyNum获取指定HBase集群的connection */ def getConnection: Connection = this.connection /** * 将class中的field转为map映射 * * @param clazz Class类型 * @return 名称与字段的映射map */ @Internal private[this] def getFieldNameMap[T <: HBaseBaseBean[T]](clazz: Class[T]): JMap[String, Field] = { if (!this.cacheFieldMap.containsKey(clazz)) { val allFields = ReflectionUtils.getAllFields(clazz) if (allFields != null) { val fieldMap = Maps.newHashMapWithExpectedSize[String, Field](allFields.size()) if (allFields != null) { allFields.values.filter(_ != null).foreach(field => { val fieldName = field.getAnnotation(classOf[FieldName]) var family = "" var qualifier = "" if (fieldName != null) { family = fieldName.family qualifier = fieldName.value } if (StringUtils.isBlank(family)) family = familyName(keyNum) if (StringUtils.isBlank(qualifier)) qualifier = field.getName fieldMap.put(family + ":" + qualifier, field) }) } cacheFieldMap.put(clazz, fieldMap) } } this.cacheFieldMap.get(clazz) } /** * 为指定对象的field赋值 * * @param obj 目标对象 * @param field 指定filed * @param value byte类型的数据 */ @Internal private def setFieldBytesValue[T <: HBaseBaseBean[T]](obj: T, field: Field, value: Array[Byte]): Unit = { tryWithLog { if (field != null && value != null && value.nonEmpty) { ReflectionUtils.setAccessible(field) val toValue = field.getType match { case fieldType if fieldType eq classOf[JString] => Bytes.toString(value) case fieldType if fieldType eq classOf[JInt] => Bytes.toInt(value) case fieldType if fieldType eq classOf[JDouble] => Bytes.toDouble(value) case fieldType if fieldType eq classOf[JLong] => Bytes.toLong(value) case fieldType if fieldType eq classOf[JBigDecimal] => Bytes.toBigDecimal(value) case fieldType if fieldType eq classOf[JFloat] => Bytes.toFloat(value) case fieldType if fieldType eq classOf[JBoolean] => Bytes.toBoolean(value) case fieldType if fieldType eq classOf[JShort] => Bytes.toShort(value) } field.set(obj, toValue) } else if (field != null) field.set(obj, null) }(this.logger, catchLog = s"为filed ${field}设置赋值过程中出现异常") } /** * 将含有多版本的cell映射为field * * @param rs hbase 结果集 * @param clazz 目标类型 * @param fieldMap 字段映射信息 */ @Internal private[this] def multiCell2Field[T <: HBaseBaseBean[T] : ClassTag](rs: Result, clazz: Class[T], fieldMap: JMap[String, Field]): ListBuffer[T] = { val objList = ListBuffer[T]() tryWithLog { if (rs != null) { rs.rawCells.filter(_ != null).foreach(cell => { val obj = new MultiVersionsBean val rowKey = new String(CellUtil.cloneRow(cell), StandardCharsets.UTF_8) val family = new String(CellUtil.cloneFamily(cell), StandardCharsets.UTF_8) val qualifier = new String(CellUtil.cloneQualifier(cell), StandardCharsets.UTF_8) val value = CellUtil.cloneValue(cell) val field = fieldMap.get(family + ":" + qualifier) this.setFieldBytesValue(obj, field, value) val idField = 
ReflectionUtils.getFieldByName(clazz, "rowKey") requireNonEmpty(idField)(s"${clazz}中必须有名为rowKey的成员变量") idField.set(obj, rowKey) if (StringUtils.isNotBlank(obj.getMultiFields)) objList.add(JSONUtils.parseObject[T](obj.getMultiFields)) }) } }(this.logger, catchLog = s"将多版本json数据转为类型${clazz}过程中发生失败.") objList } /** * 将cell中的值转为File的值 * * @param clazz 类类型 * @param fieldMap 成员变量信息 * @param rs hbase查询结果集 * @return clazz对应的结果实例 */ @Internal private[this] def cell2Field[T <: HBaseBaseBean[T]](clazz: Class[T], fieldMap: JMap[String, Field], rs: Result): Option[T] = { val cells = rs.rawCells if (cells == null) return None val obj = clazz.newInstance tryWithLog { val rowKey = convertCells2Fields(fieldMap, obj, cells) val idField = ReflectionUtils.getFieldByName(clazz, "rowKey") requireNonEmpty(idField)(s"${clazz}中必须有名为rowKey的成员变量") ReflectionUtils.setAccessible(idField) idField.set(obj, rowKey) }(this.logger, catchLog = "将HBase cell中的值转换并赋值给field过程中报错.") Some(obj) } /** * 一次循环取出cell中的值赋值给各个field * * @param obj 对象实例 * @param cells hbase结果集中的cells集合 * @return rowkey */ @Internal private[this] def convertCells2Fields[T <: HBaseBaseBean[T]](fieldMap: JMap[String, Field], obj: T, cells: Array[Cell]): String = { requireNonEmpty(fieldMap, obj) var rowKey = "" if (cells != null) { cells.filter(_ != null).foreach(cell => { rowKey = new String(CellUtil.cloneRow(cell), StandardCharsets.UTF_8) val family = new String(CellUtil.cloneFamily(cell), StandardCharsets.UTF_8) val qualifier = new String(CellUtil.cloneQualifier(cell), StandardCharsets.UTF_8) val value = CellUtil.cloneValue(cell) val field = fieldMap.get(family + ":" + qualifier) this.setFieldBytesValue(obj, field, value) }) } rowKey } /** * 将结果映射到自定义bean中 * * @param rs HBase查询结果集 * @param clazz 映射的目标Class类型 * @return 目标类型实例 */ @Internal private[fire] def hbaseRow2Bean[T <: HBaseBaseBean[T]](rs: Result, clazz: Class[T]): Option[T] = { requireNonNull(rs, clazz)("参数不合法,HBase Row转为JavaBean失败.") val fieldMap = this.getFieldNameMap(clazz) requireNonEmpty(fieldMap)(s"${clazz}中未声明任何成员变量或成员变量未声明注解@FieldName") this.cell2Field(clazz, fieldMap, rs) } /** * 将结果映射到自定义bean中 * * @param rsArr HBase查询结果集 * @param clazz 映射的目标Class类型 * @return 目标类型实例 */ @Internal private[fire] def hbaseRow2Bean[T <: HBaseBaseBean[T]](rsArr: ListBuffer[Result], clazz: Class[T]): ListBuffer[T] = { requireNonNull(rsArr, clazz)("参数不合法,HBase Row转为JavaBean失败.") val fieldMap = this.getFieldNameMap(clazz) requireNonEmpty(fieldMap)(s"${clazz}中未声明任何成员变量或成员变量未声明注解@FieldName") val objList = ListBuffer[T]() rsArr.filter(rs => rs != null && !rs.isEmpty).foreach(rs => { val obj = this.cell2Field(clazz, fieldMap, rs) if (obj.isDefined) objList += obj.get }) objList } /** * 将结果映射到自定义bean中 * * @param rs HBase查询结果集 * @param clazz 映射的目标Class类型 * @return 目标类型实例 */ @Internal private[fire] def hbaseMultiRow2Bean[T <: HBaseBaseBean[T] : ClassTag](rs: Result, clazz: Class[T]): ListBuffer[T] = { requireNonNull(rs, clazz)("参数不合法,HBase MultiRow转为JavaBean失败.") val fieldMap = this.getFieldNameMap(classOf[MultiVersionsBean]) requireNonEmpty(fieldMap)(s"${clazz}中未声明任何成员变量或成员变量未声明注解@FieldName") this.multiCell2Field[T](rs, clazz, fieldMap) } /** * 将结果映射到自定义bean中 * * @param rsArr HBase查询结果集 * @param clazz 映射的目标Class类型 * @return 目标类型实例 */ @Internal private[fire] def hbaseMultiRow2Bean[T <: HBaseBaseBean[T] : ClassTag](rsArr: ListBuffer[Result], clazz: Class[T]): ListBuffer[T] = { requireNonNull(rsArr, clazz)("参数不合法,HBase Row转为JavaBean失败.") val fieldMap = getFieldNameMap(classOf[MultiVersionsBean]) 
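    // Note: the field map is built from MultiVersionsBean rather than the target clazz,
    // because multi-version rows are stored as a single JSON qualifier (multiFields)
    // and are deserialized back into T by multiCell2Field afterwards.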
requireNonEmpty(fieldMap)(s"${clazz}中未声明任何成员变量或成员变量未声明注解@FieldName") val objList = ListBuffer[T]() rsArr.filter(rs => rs != null && !rs.isEmpty).foreach(rs => objList ++= this.multiCell2Field[T](rs, clazz, fieldMap)) objList } /** * 将结果映射到自定义bean中 * * @param it HBase查询结果集 * @param clazz 映射的目标Class类型 * @return 目标类型实例 */ @Internal private[fire] def hbaseRow2BeanList[T <: HBaseBaseBean[T]](it: Iterator[(ImmutableBytesWritable, Result)], clazz: Class[T]): Iterator[T] = { requireNonNull(it, clazz) val fieldMap = this.getFieldNameMap(clazz) requireNonEmpty(fieldMap)(s"${clazz}中未声明任何成员变量或成员变量未声明注解@FieldName") val beanList = ListBuffer[T]() tryWithLog { it.foreach(t => { val cells = t._2.rawCells() if (cells != null) { val obj = clazz.newInstance() val rowKey = this.convertCells2Fields(fieldMap, obj, cells) val idField = ReflectionUtils.getFieldByName(clazz, "rowKey") requireNonEmpty(idField)(s"${clazz}中必须有名为rowKey的成员变量") idField.set(obj, rowKey) beanList += obj } }) }(this.logger, catchLog = "执行hbaseRow2BeanList过程中出现异常") beanList.iterator } /** * 将多版本结果映射到自定义bean中 * * @param it HBase查询结果集 * @param clazz 映射的目标Class类型 * @return 目标类型实例 */ @Internal private[fire] def hbaseMultiVersionRow2BeanList[T <: HBaseBaseBean[T] : ClassTag](it: Iterator[(ImmutableBytesWritable, Result)], clazz: Class[T]): Iterator[T] = { requireNonNull(it, clazz) val beanList = ListBuffer[T]() tryWithLog { it.foreach(t => { beanList ++= this.hbaseMultiRow2Bean[T](t._2, clazz) }) }(this.logger, catchLog = "将HBase多版本Row转为JavaBean过程中出现异常.") beanList.iterator } /** * 将Javabean转为put对象 * * @param obj 对象 * @param insertEmpty true:插入null字段,false:不插入空字段 * @return put对象实例 */ @Internal private[fire] def convert2Put[T <: HBaseBaseBean[T]](obj: T, insertEmpty: Boolean): Put = { requireNonEmpty(obj, insertEmpty)("参数不能为空,无法将对象转为HBase Put对象") tryWithReturn { var tmpObj = obj val clazz = tmpObj.getClass val rowKeyField = ReflectionUtils.getFieldByName(clazz, "rowKey") var rowKeyObj = rowKeyField.get(tmpObj) if (rowKeyObj == null) { val method = ReflectionUtils.getMethodByName(clazz, "buildRowKey") tmpObj = method.invoke(tmpObj).asInstanceOf[T] rowKeyObj = rowKeyField.get(tmpObj) requireNonEmpty(rowKeyObj)(s"rowKey不能为空,请检查${clazz}中是否实现buildRowKey()方法!") } val allFields = ReflectionUtils.getAllFields(clazz) requireNonEmpty(allFields)(s"在${clazz}中未找到任何成员变量,请检查!") val rowKey = rowKeyObj.toString.getBytes(StandardCharsets.UTF_8) val put = new Put(rowKey) put.setDurability(this.durability) allFields.values().foreach(field => { val objValue = field.get(obj) // 将objValue插入的两种情况:1. 允许插入为空的值;2. 
不允许插入为空的值,并且objValue不为空 if (insertEmpty || (!insertEmpty && objValue != null)) { val fieldName = field.getAnnotation(classOf[FieldName]) var name = "" var familyName = "" if (fieldName != null && !fieldName.disuse) { familyName = fieldName.family name = fieldName.value } // 如果未声明@FieldName注解或者声明了@FieldName注解但同时在注解中的disuse指定为false,则进行字段的转换 // 如果不满足以上两个条件,则任务当前字段不需要转为Put对象中的qualifier if (fieldName == null || (fieldName != null && !fieldName.disuse())) { if (StringUtils.isBlank(familyName)) familyName = FireHBaseConf.familyName(keyNum) if (StringUtils.isBlank(name)) name = field.getName val famliyByte = familyName.getBytes(StandardCharsets.UTF_8) val qualifierByte = name.getBytes(StandardCharsets.UTF_8) if (objValue != null) { val objValueStr = objValue.toString val toBytes = field.getType match { case fieldType if fieldType eq classOf[JString] => Bytes.toBytes(objValueStr) case fieldType if fieldType eq classOf[JInt] => Bytes.toBytes(JInt.parseInt(objValueStr)) case fieldType if fieldType eq classOf[JDouble] => Bytes.toBytes(JDouble.parseDouble(objValueStr)) case fieldType if fieldType eq classOf[JLong] => Bytes.toBytes(JLong.parseLong(objValueStr)) case fieldType if fieldType eq classOf[JBigDecimal] => Bytes.toBytes(new JBigDecimal(objValueStr)) case fieldType if fieldType eq classOf[JFloat] => Bytes.toBytes(JFloat.parseFloat(objValueStr)) case fieldType if fieldType eq classOf[JBoolean] => Bytes.toBytes(JBoolean.parseBoolean(objValueStr)) case fieldType if fieldType eq classOf[JShort] => Bytes.toBytes(JShort.parseShort(objValueStr)) } put.addColumn(famliyByte, qualifierByte, toBytes) } else { put.addColumn(famliyByte, qualifierByte, null) } } } }) put }(this.logger, catchLog = "将JavaBean转为HBase Put对象过程中出现异常.") } /** * 提供给fire-spark引擎的工具方法 * * @param obj 继承自HBaseBaseBean的子类实例 * @return HBaseBaseBean的子类实例 */ @Internal private[fire] def convert2PutTuple[T <: HBaseBaseBean[T]](obj: T, insertEmpty: Boolean = true): (ImmutableBytesWritable, Put) = { (new ImmutableBytesWritable(), convert2Put(obj, insertEmpty)) } /** * 获取类注解HConfig中的nullable */ @Internal private[fire] def getNullable[T <: HBaseBaseBean[T] : ClassTag]: Boolean = { val hConfig = this.getHConfig[T] if (hConfig == null) return true hConfig.nullable() } /** * 获取类注解HConfig中的multiVersion */ @Internal private[fire] def getMultiVersion[T <: HBaseBaseBean[T] : ClassTag]: Boolean = { val hConfig = this.getHConfig[T] if (hConfig == null) return false hConfig.multiVersion() } /** * 获取类上声明的HConfig注解 */ @Internal private[fire] def getHConfig[T <: HBaseBaseBean[T] : ClassTag]: HConfig = { val clazz = classTag[T].runtimeClass if (!this.cacheHConfigMap.containsKey(clazz)) { val hConfig = clazz.getAnnotation(classOf[HConfig]) if (hConfig != null) { this.cacheHConfigMap.put(clazz, hConfig) } } this.cacheHConfigMap.get(clazz) } /** * 根据keyNum获取对应配置的durability */ @Internal private[this] def initDurability: Durability = { val hbaseDurability = FireHBaseConf.hbaseDurability(keyNum) // 将匹配到的配置转为Durability对象 hbaseDurability.toUpperCase match { case "ASYNC_WAL" => Durability.ASYNC_WAL case "FSYNC_WAL" => Durability.FSYNC_WAL case "SKIP_WAL" => Durability.SKIP_WAL case "SYNC_WAL" => Durability.SYNC_WAL case _ => Durability.USE_DEFAULT } } /** * 创建HBase表 * * @param tableName * 表名 * @param families * 列族 */ private[fire] def createTable(tableName: String, families: String*): Unit = { requireNonEmpty(tableName, families)("执行createTable失败") var admin: Admin = null tryFinallyWithReturn { admin = this.getConnection.getAdmin val tbName = 
TableName.valueOf(tableName) if (!admin.tableExists(tbName)) { val tableDesc = new HTableDescriptor(tbName) // 在描述里添加列族 for (columnFamily <- families) { val desc = new HColumnDescriptor(columnFamily) // 启用压缩 desc.setCompressionType(Compression.Algorithm.SNAPPY) tableDesc.addFamily(desc) } admin.createTable(tableDesc) LineageManager.addDBDatasource("HBase", hbaseClusterUrl(keyNum), tableName, operation = FOperation.CREATE_TABLE) // 如果开启表缓存,则更新缓存信息 if (this.tableExistsCacheEnable && this.tableExists(tableName)) this.cacheTableExistsMap.update(tableName, true) } } { this.closeAdmin(admin) }(logger, s"HBase createTable ${hbaseClusterUrl(keyNum)}.${tableName}执行成功", s"创建HBase表${hbaseClusterUrl(keyNum)}.${tableName}失败.", closeAdminError) } /** * 删除指定的HBase表 * * @param tableName 表名 */ private[fire] def dropTable(tableName: String): Unit = { requireNonEmpty(tableName)("执行dropTable失败") var admin: Admin = null tryFinallyWithReturn { admin = this.getConnection.getAdmin val tbName = TableName.valueOf(tableName) if (admin.tableExists(tbName)) { admin.disableTable(tbName) admin.deleteTable(tbName) // 如果开启表缓存,则更新缓存信息 if (this.tableExistsCacheEnable && !this.tableExists(tableName)) this.cacheTableExistsMap.update(tableName, false) LineageManager.addDBDatasource("HBase", hbaseClusterUrl(keyNum), tableName, operation = FOperation.DROP_TABLE) } } { this.closeAdmin(admin) }(this.logger, s"HBase createTable ${hbaseClusterUrl(keyNum)}.${tableName}执行成功", s"drop ${hbaseClusterUrl(keyNum)}.${tableName}表操作失败", closeAdminError) } /** * 启用指定的HBase表 * * @param tableName 表名 */ private[fire] def enableTable(tableName: String): Unit = { requireNonEmpty(tableName)("执行enableTable失败") var admin: Admin = null tryFinallyWithReturn { admin = this.getConnection.getAdmin val tbName = TableName.valueOf(tableName) if (admin.tableExists(tbName) && !admin.isTableEnabled(tbName)) { admin.enableTable(tbName) LineageManager.addDBDatasource("HBase", hbaseClusterUrl(keyNum), tableName, operation = FOperation.ENABLE_TABLE) } } { this.closeAdmin(admin) }(this.logger, s"HBase enableTable ${hbaseClusterUrl(keyNum)}.${tableName}执行成功", s"enable ${hbaseClusterUrl(keyNum)}.${tableName}表失败", closeAdminError) } /** * disable指定的HBase表 * * @param tableName 表名 */ private[fire] def disableTable(tableName: String): Unit = { requireNonEmpty(tableName)("执行disableTable失败") var admin: Admin = null tryFinallyWithReturn { admin = this.getConnection.getAdmin val tbName = TableName.valueOf(tableName) if (admin.tableExists(tbName) && admin.isTableEnabled(tbName)) { admin.disableTable(tbName) LineageManager.addDBDatasource("HBase", hbaseClusterUrl(keyNum), tableName, operation = FOperation.DISABLE_TABLE) } } { this.closeAdmin(admin) }(this.logger, s"HBase disableTable ${hbaseClusterUrl(keyNum)}.${tableName}执行成功", s"disable ${hbaseClusterUrl(keyNum)}.${tableName}表失败", closeAdminError) } /** * 清空指定的HBase表 * * @param tableName HBase表名 * @param preserveSplits 是否保留所有的split信息 */ private[fire] def truncateTable(tableName: String, preserveSplits: Boolean = true): Unit = { requireNonEmpty(tableName, preserveSplits)("执行truncateTable失败") var admin: Admin = null tryFinallyWithReturn { admin = this.getConnection.getAdmin val tbName = TableName.valueOf(tableName) if (admin.tableExists(tbName)) { this.disableTable(tableName) admin.truncateTable(tbName, preserveSplits) LineageManager.addDBDatasource("HBase", hbaseClusterUrl(keyNum), tableName, operation = FOperation.TRUNCATE) } } { this.closeAdmin(admin) }(this.logger, s"HBase truncateTable 
${hbaseClusterUrl(keyNum)}.${tableName}执行成功", s"truncate ${hbaseClusterUrl(keyNum)}.${tableName}表失败", closeAdminError) } /** * 释放对象 * * @param admin admin对象实例 */ @Internal private[this] def closeAdmin(admin: Admin): Unit = { tryWithLog { if (admin != null) admin.close() }(logger, catchLog = "关闭HBase admin对象失败") } /** * 关闭ResultScanner对象 */ @Internal private[this] def closeResultScanner(rs: ResultScanner): Unit = { tryWithLog { if (rs != null) rs.close() }(this.logger, catchLog = "关闭ResultScanner对象失败", isThrow = false) } /** * 关闭table对象 */ def closeTable(table: Table): Unit = { tryWithLog { if (table != null) table.close() }(logger, catchLog = "关闭HBase table对象失败", isThrow = true) } /** * 根据表名获取Table实例 * * @param tableName 表名 */ def getTable(tableName: String): Table = { tryWithReturn { require(this.isExists(tableName), s"表${tableName}不存在,请检查") this.getConnection.getTable(TableName.valueOf(tableName)) }(logger, catchLog = s"HBase getTable操作失败. ${hbaseClusterUrl(keyNum)}.${tableName}") } /** * 判断给定的表名是否存在 * * @param tableName * HBase表名 */ def isExists(tableName: String): Boolean = { if (StringUtils.isBlank(tableName)) return false if (this.tableExistsCacheEnable) { // 如果走缓存 if (!this.cacheTableExistsMap.containsKey(tableName)) { this.logger.debug(s"已缓存${tableName}是否存在信息,后续将走缓存.") this.cacheTableExistsMap.put(tableName, this.tableExists(tableName)) } this.cacheTableExistsMap.get(tableName) } else { // 不走缓存则每次连接HBase获取表是否存在的信息 this.tableExists(tableName) } } /** * 用于判断HBase表是否存在 * 注:内部api,每次需连接HBase获取表信息 */ @Internal private[fire] def tableExists(tableName: String): Boolean = { if (StringUtils.isBlank(tableName)) return false var admin: Admin = null tryFinallyWithReturn { admin = this.getConnection.getAdmin val isExists = admin.tableExists(TableName.valueOf(tableName)) this.logger.debug(s"HBase tableExists ${hbaseClusterUrl(keyNum)}.${tableName}获取成功") isExists } { closeAdmin(admin) }(logger, catchLog = s"判断HBase表${hbaseClusterUrl(keyNum)}.${tableName}是否存在失败") } /** * 根据多个rowKey删除对应的整行记录 * * @param tableName 表名 * @param rowKeys 待删除的rowKey集合 */ def deleteRows(tableName: String, rowKeys: String*): Unit = { if (noEmpty(tableName, rowKeys)) { var table: Table = null tryFinallyWithReturn { table = this.getTable(tableName) val deletes = ListBuffer[Delete]() rowKeys.filter(StringUtils.isNotBlank).foreach(rowKey => { deletes += new Delete(rowKey.getBytes(StandardCharsets.UTF_8)) }) table.delete(deletes) LineageManager.addDBDatasource("HBase", hbaseClusterUrl(keyNum), tableName, operation = FOperation.DELETE) } { this.closeTable(table) }(this.logger, s"HBase deleteRows ${hbaseClusterUrl(keyNum)}.${tableName}执行成功", s"执行${tableName}表rowKey删除失败", "close HBase table对象失败") } } /** * 批量删除指定RowKey的多个列族 * * @param tableName 表名 * @param rowKey rowKey * @param families 多个列族 */ @Internal private[fire] def deleteFamilies(tableName: String, rowKey: String, families: String*): Unit = { if (noEmpty(tableName, rowKey, families)) { val delete = new Delete(rowKey.getBytes(StandardCharsets.UTF_8)) families.filter(StringUtils.isNotBlank).foreach(family => delete.addFamily(family.getBytes(StandardCharsets.UTF_8))) var table: Table = null tryFinallyWithReturn { table = this.getTable(tableName) table.delete(delete) LineageManager.addDBDatasource("HBase", hbaseClusterUrl(keyNum), tableName, operation = FOperation.DELETE_FAMILY) } { this.closeTable(table) }(this.logger, s"HBase deleteFamilies ${hbaseClusterUrl(keyNum)}.${tableName}执行成功", s"delete ${hbaseClusterUrl(keyNum)}.${tableName} families failed. 
RowKey is ${rowKey}, families is ${families}", "close HBase table对象出现异常.") } } /** * 批量删除指定列族下的多个字段 * * @param tableName 表名 * @param rowKey rowKey字段 * @param family 列族 * @param qualifiers 列名 */ @Internal private[fire] def deleteQualifiers(tableName: String, rowKey: String, family: String, qualifiers: String*): Unit = { if (noEmpty(tableName, rowKey, family, qualifiers)) { val delete = new Delete(rowKey.getBytes(StandardCharsets.UTF_8)) qualifiers.foreach(qualifier => delete.addColumns(family.getBytes(StandardCharsets.UTF_8), qualifier.getBytes(StandardCharsets.UTF_8))) var table: Table = null tryFinallyWithReturn { table = this.getTable(tableName) table.delete(delete) LineageManager.addDBDatasource("HBase", hbaseClusterUrl(keyNum), tableName, operation = FOperation.DELETE_QUALIFIER) } { this.closeTable(table) }(this.logger, s"HBase deleteQualifiers ${hbaseClusterUrl(keyNum)}.${tableName}执行成功", s"delete ${hbaseClusterUrl(keyNum)}.${tableName} qualifiers failed. RowKey is ${rowKey}, qualifiers is ${qualifiers}", "close HBase table对象出现异常.") } } /** * 用于定时reload表是否存在的数据 */ @Internal private[this] def registerReload(): Unit = { if (tableExistsCacheReload(this.keyNum)) { ThreadUtils.scheduleWithFixedDelay({ val start = currentTime cacheTableExistsMap.foreach(kv => { cacheTableExistsMap.update(kv._1, tableExists(kv._1)) // 将用到的表信息加入到数据源管理器中 logger.debug(s"定时reload HBase表:${kv._1} 信息成功.") }) logger.debug(s"定时reload HBase耗时:${elapsed(start)}") }, tableExistCacheInitialDelay(this.keyNum), tableExistCachePeriod(this.keyNum), TimeUnit.SECONDS) } } /** * 用于初始化单例的configuration */ @Internal override protected[fire] def open(): Unit = { val finalConf = if (this.conf != null) this.conf else HBaseConfiguration.create() val url = hbaseClusterUrl(keyNum) if (StringUtils.isNotBlank(url)) finalConf.set("hbase.zookeeper.quorum", url) // 以spark.fire.hbase.conf.xxx[keyNum]开头的配置信息 PropUtils.sliceKeysByNum(hbaseConfPrefix, keyNum).foreach(kv => { logger.info(s"hbase configuration: key=${kv._1} value=${kv._2}") finalConf.set(kv._1, kv._2) }) requireNonEmpty(finalConf.get("hbase.zookeeper.quorum"))(s"未配置HBase集群信息,请通过以下参数指定:spark.hbase.cluster[$keyNum]=xxx") this.configuration = finalConf } /** * connector关闭 */ override protected def close(): Unit = { if (this.connection != null && !this.connection.isClosed) { this.connection.close() logger.debug(s"释放HBase connection成功. 
keyNum=$keyNum") } } /** * 获取HBaseBaseBean子类@HConfig中的versions的值 */ @Internal private[this] def getVersions[T <: HBaseBaseBean[T] : ClassTag]: Int = { val clazz = getParamType[T] val hConfig = ReflectionUtils.getClassAnnotation(clazz, classOf[HConfig]) // 仅当开启多版本的情况下versions的值才有效 if (hConfig == null || !this.getMultiVersion[T]) 1 else hConfig.asInstanceOf[HConfig].versions } /** * 为Get对象设置获取最大的版本数 */ @Internal private[fire] def getMaxVersions[T <: HBaseBaseBean[T] : ClassTag](gets: Get*): Unit = { val versions = this.getVersions[T] if (this.getMultiVersion[T] && versions > 1) gets.foreach(get => get.setMaxVersions(versions)) } /** * 为Scan对象设置获取最大的版本数 */ @Internal private[fire] def setScanMaxVersions[T <: HBaseBaseBean[T] : ClassTag](scan: Scan): Unit = { val versions = this.getVersions[T] if (this.getMultiVersion[T] && versions > 1) scan.setMaxVersions(versions) } } /** * 用于单例构建伴生类HBaseConnector的实例对象 * 每个HBaseConnector实例使用keyNum作为标识,并且与每个HBase集群一一对应 */ object HBaseConnector extends ConnectorFactory[HBaseConnector] with HBaseFunctions { /** * 创建HBaseConnector */ override protected def create(conf: Any = null, keyNum: Int = 1): HBaseConnector = { requireNonEmpty(keyNum) val connector = new HBaseConnector(conf.asInstanceOf[Configuration], keyNum) logger.debug(s"创建HBaseConnector实例成功. keyNum=$keyNum") connector } } ================================================ FILE: fire-connectors/base-connectors/fire-hbase/src/main/scala/com/zto/fire/hbase/HBaseFunctions.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.hbase import java.nio.charset.StandardCharsets import com.zto.fire.predef._ import com.zto.fire.common.anno.Internal import com.zto.fire.hbase.bean.HBaseBaseBean import org.apache.commons.lang3.StringUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hbase.client.{Connection, Get, Put, Result, ResultScanner, Scan} import org.apache.hadoop.hbase.filter.{Filter, FilterList} import scala.collection.mutable.ListBuffer import scala.reflect.ClassTag /** * HBase API库 * * @author ChengLong * @since 2.0.0 * @create 2020-12-23 15:44 */ private[hbase] trait HBaseFunctions { /** * 构建Get对象 * * @param rowKey rowKey * @param family 列族名称 * @param qualifier 表的qualifier名称 */ def buildGet(rowKey: String, family: String = null, qualifier: String = "", maxVersions: Int = 1, filter: Filter = null): Get = { require(StringUtils.isNotBlank(rowKey), "buildGet执行失败,rowKey不能为空!") val get = new Get(rowKey.getBytes(StandardCharsets.UTF_8)) if (StringUtils.isNotBlank(family) && StringUtils.isNotBlank(qualifier)) { get.addColumn(family.getBytes(StandardCharsets.UTF_8), qualifier.getBytes(StandardCharsets.UTF_8)) } else if (StringUtils.isNotBlank(family)) { get.addFamily(family.getBytes(StandardCharsets.UTF_8)) } if (filter != null) get.setFilter(filter) if (maxVersions > 0) get.setMaxVersions(maxVersions) get } /** * 构建Scan对象 * * @param startRow 指定起始rowkey * @param endRow 指定结束rowkey * @param filterList 过滤器 * @return scan实例 */ def buildScan(startRow: String, endRow: String, family: String = null, qualifier: String = "", maxVersions: Int = 1, filterList: FilterList = null, batch: Int = -1): Scan = { val scan = new Scan if (StringUtils.isNotBlank(startRow)) scan.setStartRow(startRow.getBytes(StandardCharsets.UTF_8)) if (StringUtils.isNotBlank(endRow)) scan.setStopRow(endRow.getBytes(StandardCharsets.UTF_8)) if (StringUtils.isNotBlank(family) && StringUtils.isNotBlank(qualifier)) { scan.addColumn(family.getBytes(StandardCharsets.UTF_8), qualifier.getBytes(StandardCharsets.UTF_8)) } else if (StringUtils.isNotBlank(family)) { scan.addFamily(family.getBytes(StandardCharsets.UTF_8)) } if (filterList != null) scan.setFilter(filterList) if (maxVersions > 0) scan.setMaxVersions(maxVersions) if (batch > 0) scan.setBatch(batch) scan } /** * 批量插入多行多列,自动将HBaseBaseBean子类转为Put集合 * * @param tableName 表名 * @param beans HBaseBaseBean子类集合 */ def insert[T <: HBaseBaseBean[T] : ClassTag](tableName: String, beans: Seq[T], keyNum: Int = 1): Unit = { HBaseConnector(keyNum = keyNum).insert[T](tableName, beans: _*) } /** * 批量插入多行多列 * * @param tableName 表名 * @param puts Put集合 */ def insert(tableName: String, puts: Seq[Put], keyNum: Int): Unit = { HBaseConnector(keyNum = keyNum).insert(tableName, puts: _*) } /** * 从HBase批量Get数据,并将结果封装到JavaBean中 * * @param tableName 表名 * @param rowKeys 指定的多个rowKey * @param clazz 目标类类型,必须是HBaseBaseBean的子类 * @return 目标对象实例 */ def get[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], rowKeys: Seq[String], keyNum: Int = 1): ListBuffer[T] = { HBaseConnector(keyNum = keyNum).get[T](tableName, clazz, rowKeys: _*) } /** * 从HBase批量Get数据,并将结果封装到JavaBean中 * * @param tableName 表名 * @param clazz 目标类类型,必须是HBaseBaseBean的子类 * @param gets 指定的多个get对象 * @return 目标对象实例 */ def get[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], gets: ListBuffer[Get], keyNum: Int): ListBuffer[T] = { HBaseConnector(keyNum = keyNum).get[T](tableName, clazz, gets: _*) } /** * 通过HBase Seq[Get]获取多条数据 * * @param tableName 表名 * @param getList HBase的get对象实例 * @return * HBase 
Result */ def getResult(tableName: String, getList: Seq[Get], keyNum: Int): ListBuffer[Result] = { HBaseConnector(keyNum = keyNum).getResult(tableName, getList: _*) } /** * 通过HBase Get对象获取一条数据 * * @param tableName 表名 * @return * HBase Result */ def getResult[T: ClassTag](tableName: String, rowKeyList: Seq[String], keyNum: Int = 1): ListBuffer[Result] = { HBaseConnector(keyNum = keyNum).getResult[T](tableName, rowKeyList: _*) } /** * 表扫描,将scan后得到的ResultScanner对象直接返回 * 注:调用者需手动关闭ResultScanner对象实例 * * @param tableName 表名 * @param scan HBase scan对象 * @return 指定类型的List */ def scanResultScanner(tableName: String, scan: Scan, keyNum: Int): ResultScanner = { HBaseConnector(keyNum = keyNum).scanResultScanner(tableName, scan) } /** * 表扫描,将scan后得到的ResultScanner对象直接返回 * 注:调用者需手动关闭ResultScanner对象实例 * * @param tableName 表名 * @param startRow 开始行 * @param endRow 结束行 * @return 指定类型的List */ def scanResultScanner(tableName: String, startRow: String, endRow: String, keyNum: Int = 1): ResultScanner = { HBaseConnector(keyNum = keyNum).scanResultScanner(tableName, startRow, endRow) } /** * 表扫描,将查询后的数据转为JavaBean并放到List中 * * @param tableName 表名 * @param startRow 开始行 * @param endRow 结束行 * @param clazz 类型 * @return 指定类型的List */ def scan[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, endRow: String, keyNum: Int = 1): ListBuffer[T] = { HBaseConnector(keyNum = keyNum).scan[T](tableName, clazz, startRow, endRow) } /** * 表扫描,将查询后的数据转为JavaBean并放到List中 * * @param tableName 表名 * @param scan HBase scan对象 * @param clazz 类型 * @return 指定类型的List */ def scan[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan, keyNum: Int): ListBuffer[T] = { HBaseConnector(keyNum = keyNum).scan[T](tableName, clazz, scan) } /** * 根据keyNum获取指定HBase集群的connection */ def getConnection(keyNum: Int = 1): Connection = HBaseConnector(keyNum = keyNum).getConnection /** * 创建HBase表 * * @param tableName * 表名 * @param families * 列族 */ private[fire] def createTable(tableName: String, families: Seq[String], keyNum: Int = 1): Unit = { HBaseConnector(keyNum = keyNum).createTable(tableName, families: _*) } /** * 删除指定的HBase表 * * @param tableName 表名 */ private[fire] def dropTable(tableName: String, keyNum: Int = 1): Unit = { HBaseConnector(keyNum = keyNum).dropTable(tableName) } /** * 启用指定的HBase表 * * @param tableName 表名 */ private[fire] def enableTable(tableName: String, keyNum: Int = 1): Unit = { HBaseConnector(keyNum = keyNum).enableTable(tableName) } /** * disable指定的HBase表 * * @param tableName 表名 */ private[fire] def disableTable(tableName: String, keyNum: Int = 1): Unit = { HBaseConnector(keyNum = keyNum).disableTable(tableName) } /** * 清空指定的HBase表 * * @param tableName * 表名 * @param preserveSplits 是否保留所有的split信息 */ private[fire] def truncateTable(tableName: String, preserveSplits: Boolean = true, keyNum: Int = 1): Unit = { HBaseConnector(keyNum = keyNum).truncateTable(tableName, preserveSplits) } /** * 用于判断HBase表是否存在 */ def tableExists(tableName: String, keyNum: Int = 1): Boolean = { HBaseConnector(keyNum = keyNum).tableExists(tableName) } /** * 用于判断HBase表是否存在(走缓存) */ def isExists(tableName: String, keyNum: Int = 1): Boolean = { HBaseConnector(keyNum = keyNum).isExists(tableName) } /** * 根据多个rowKey删除对应的整行记录 * * @param tableName 表名 * @param rowKeys 待删除的rowKey集合 */ def deleteRows(tableName: String, rowKeys: Seq[String], keyNum: Int = 1): Unit = { HBaseConnector(keyNum = keyNum).deleteRows(tableName, rowKeys: _*) } /** * 批量删除指定RowKey的多个列族 * * @param tableName 表名 * @param rowKey rowKey * @param 
families 多个列族 */ @Internal private[fire] def deleteFamilies(tableName: String, rowKey: String, families: Seq[String], keyNum: Int = 1): Unit = { HBaseConnector(keyNum = keyNum).deleteFamilies(tableName, rowKey, families: _*) } /** * 批量删除指定列族下的多个字段 * * @param tableName 表名 * @param rowKey rowKey字段 * @param family 列族 * @param qualifiers 列名 */ @Internal private[fire] def deleteQualifiers(tableName: String, rowKey: String, family: String, qualifiers: Seq[String], keyNum: Int = 1): Unit = { HBaseConnector(keyNum = keyNum).deleteQualifiers(tableName, rowKey, family, qualifiers: _*) } /** * 获取Configuration实例 * * @return HBase Configuration对象 */ def getConfiguration(keyNum: Int = 1): Configuration = HBaseConnector(keyNum = keyNum).getConfiguration /** * 校验类型合法性,class必须是HBaseBaseBean的子类 */ def checkClass[T: ClassTag](clazz: Class[_] = null): Unit = { val finalClazz = if (clazz != null) clazz else getParamType[T] if (finalClazz == null || finalClazz.getSuperclass != classOf[HBaseBaseBean[_]]) throw new IllegalArgumentException("请指定泛型类型,该泛型必须是HBaseBaseBean的子类,如:this.fire.hbasePutTable[JavaBean]") } } ================================================ FILE: fire-connectors/base-connectors/fire-hbase/src/main/scala/com/zto/fire/hbase/bean/HBaseBaseBean.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.hbase.bean; import com.zto.fire.common.anno.FieldName; import java.io.Serializable; /** * HBase封装bean需实现该接口 * Created by ChengLong on 2017-03-27. */ public abstract class HBaseBaseBean implements Serializable { /** * rowKey字段 */ @FieldName(value = "rowKey", disuse = true) public String rowKey; /** * 子类包名+类名 */ @FieldName(value = "className", disuse = true) public final String className = this.getClass().getSimpleName(); /** * 根据业务需要,构建rowkey */ public abstract T buildRowKey(); public String getRowKey() { return rowKey; } public void setRowKey(String rowKey) { this.rowKey = rowKey; } } ================================================ FILE: fire-connectors/base-connectors/fire-hbase/src/main/scala/com/zto/fire/hbase/bean/MultiVersionsBean.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.hbase.bean; import com.zto.fire.common.anno.FieldName; import com.zto.fire.common.util.JSONUtils; import org.apache.commons.beanutils.BeanUtils; import org.apache.commons.beanutils.ConvertUtils; import org.apache.commons.beanutils.converters.BigDecimalConverter; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.math.BigDecimal; import java.util.Map; /** * 多版本HBase实体Bean * Created by ChengLong on 2017-08-17. */ public class MultiVersionsBean extends HBaseBaseBean { @FieldName(value = "logger", disuse = true) private static final transient Logger logger = LoggerFactory.getLogger(MultiVersionsBean.class); @FieldName("multiFields") private String multiFields; @FieldName(value = "HBaseBaseBean", disuse = true) private HBaseBaseBean target; @FieldName(value = "BIGDECIMAL_ZERO", disuse = true) private static final BigDecimal BIGDECIMAL_ZERO = new BigDecimal("0"); static { // 这里一定要注册默认值,使用null也可以 BigDecimalConverter bd = new BigDecimalConverter(BIGDECIMAL_ZERO); ConvertUtils.register(bd, java.math.BigDecimal.class); } public String getMultiFields() { return multiFields; } public void setMultiFields(String multiFields) { this.multiFields = multiFields; } public HBaseBaseBean getTarget() { return target; } public void setTarget(HBaseBaseBean target) { this.target = target; } public MultiVersionsBean(HBaseBaseBean target) { this.target = (HBaseBaseBean) target.buildRowKey(); this.multiFields = JSONUtils.toJSONString(this.target); } public MultiVersionsBean() { } @Override public MultiVersionsBean buildRowKey() { try { if (this.target == null && StringUtils.isNotBlank(this.multiFields)) { Map map = JSONUtils.parseObject(this.multiFields, Map.class); Class clazz = Class.forName(map.get("className").toString()); HBaseBaseBean bean = (HBaseBaseBean) clazz.newInstance(); BeanUtils.populate(bean, map); this.target = (HBaseBaseBean) bean.buildRowKey(); } if (this.target != null) { this.target = (HBaseBaseBean) this.target.buildRowKey(); this.rowKey = this.target.rowKey; } } catch (Exception e) { logger.error("执行buildRowKey()方法失败", e); } return this; } } ================================================ FILE: fire-connectors/base-connectors/fire-hbase/src/main/scala/com/zto/fire/hbase/conf/FireHBaseConf.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.hbase.conf import java.util import com.zto.fire.common.util.PropUtils import com.zto.fire.predef._ /** * hbase相关配置 * * @author ChengLong * @since 1.1.0 * @create 2020-07-13 15:08 */ private[fire] object FireHBaseConf { lazy val HBASE_BATCH = "fire.hbase.batch.size" lazy val HBBASE_COLUMN_FAMILY_KEY = "hbase.column.family" lazy val HBASE_MAX_RETRY = "hbase.max.retry" lazy val HBASE_CLUSTER_URL = "hbase.cluster" lazy val HBASE_DURABILITY = "hbase.durability" // fire框架针对hbase操作后数据集的缓存策略,配置列表详见:StorageLevel.scala(配置不区分大小写) lazy val FIRE_HBASE_STORAGE_LEVEL = "fire.hbase.storage.level" // 通过HBase scan后repartition的分区数 @deprecated("use fire.hbase.scan.partitions", "v1.0.0") lazy val FIRE_HBASE_SCAN_REPARTITIONS = "fire.hbase.scan.repartitions" lazy val FIRE_HBASE_SCAN_PARTITIONS = "fire.hbase.scan.partitions" // hbase集群映射配置前缀 lazy val hbaseClusterMapPrefix = "fire.hbase.cluster.map." // 是否开启HBase表存在判断的缓存 lazy val TABLE_EXISTS_CACHE_ENABLE = "fire.hbase.table.exists.cache.enable" // 是否开启HBase表存在列表缓存的定时更新任务 lazy val TABLE_EXISTS_CACHE_RELOAD_ENABLE = "fire.hbase.table.exists.cache.reload.enable" // 定时刷新缓存HBase表任务的初始延迟 lazy val TABLE_EXISTS_CACHE_INITIAL_DELAY = "fire.hbase.table.exists.cache.initialDelay" // 定时刷新缓存HBase表任务的执行频率 lazy val TABLE_EXISTS_CACHE_PERIOD = "fire.hbase.table.exists.cache.period" // hbase集群映射地址 lazy val hbaseClusterMap: util.Map[String, String] = PropUtils.sliceKeys(this.hbaseClusterMapPrefix) // hbase java api 配置前缀 lazy val hbaseConfPrefix = "fire.hbase.conf." // 是否开启HBase表存在判断的缓存 def tableExistsCache(keyNum: Int = 1): Boolean = PropUtils.getBoolean(this.TABLE_EXISTS_CACHE_ENABLE, true, keyNum) // 是否开启HBase表存在列表缓存的定时更新任务 def tableExistsCacheReload(keyNum: Int = 1): Boolean = PropUtils.getBoolean(this.TABLE_EXISTS_CACHE_RELOAD_ENABLE, true, keyNum) // 定时刷新缓存HBase表任务的初始延迟 def tableExistCacheInitialDelay(keyNum: Int = 1): Long = PropUtils.getLong(this.TABLE_EXISTS_CACHE_INITIAL_DELAY, 60, keyNum) // 定时刷新缓存HBase表任务的执行频率 def tableExistCachePeriod(keyNum: Int = 1): Long = PropUtils.getLong(this.TABLE_EXISTS_CACHE_PERIOD, 600, keyNum) // HBase操作默认的批次大小 def hbaseBatchSize(keyNum: Int = 1): Int = PropUtils.getInt(this.HBASE_BATCH, 10000, keyNum) // hbase默认的列族名称,如果使用FieldName指定,则会被覆盖 def familyName(keyNum: Int = 1): String = PropUtils.getString(this.HBBASE_COLUMN_FAMILY_KEY, "info", keyNum) // hbase操作失败最大重试次数 def hbaseMaxRetry(keyNum: Int = 1): Long = PropUtils.getLong(this.HBASE_MAX_RETRY, 3, keyNum) // hbase集群名称 def hbaseCluster(keyNum: Int = 1): String = PropUtils.getString(this.HBASE_CLUSTER_URL, "", keyNum) /** * 根据给定的HBase集群别名获取对应的hbase.zookeeper.quorum地址 */ def hbaseClusterUrl(keyNum: Int = 1): String = { val clusterName = this.hbaseCluster(keyNum) this.hbaseClusterMap.getOrElse(clusterName, clusterName) } /** * 根据给定的HBase集群别名获取对应的hbase.zookeeper.quorum地址 */ def hbaseClusterUrl(clusterName: String): String = { this.hbaseClusterMap.getOrElse(clusterName, clusterName) } def hbaseDurability(keyNum: Int = 1): String = PropUtils.getString(this.HBASE_DURABILITY, "", keyNum) // HBase结果集的缓存策略配置 def hbaseStorageLevel(keyNum: Int = 1): String = PropUtils.getString(this.FIRE_HBASE_STORAGE_LEVEL, "memory_and_disk_ser", keyNum).toUpperCase // 通过HBase scan后repartition的分区数,默认1200 def hbaseHadoopScanPartitions(keyNum: Int = 1): Int = { val partitions = PropUtils.getInt(this.FIRE_HBASE_SCAN_PARTITIONS, -1, keyNum) if (partitions != -1) partitions else PropUtils.getInt(this.FIRE_HBASE_SCAN_REPARTITIONS, 1200, keyNum) } } ================================================ 
FILE: fire-connectors/base-connectors/fire-hbase/src/main/scala/com/zto/fire/hbase/utils/HBaseUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.hbase.utils import org.apache.commons.lang3.StringUtils import org.apache.hadoop.hbase.client.Scan import org.apache.hadoop.hbase.protobuf.ProtobufUtil import org.apache.hadoop.hbase.util.Base64 /** * HBase 操作工具类 * * @author ChengLong 2019-6-23 13:36:16 */ private[fire] object HBaseUtils { /** * 将scan对象转为String * * @param scan * @return */ def convertScanToString(scan: Scan): String = { val proto = ProtobufUtil.toScan(scan) Base64.encodeBytes(proto.toByteArray) } /** * 将给定的字符串补齐指定的位数 * * @param str * @param length * @return */ def appendString(str: String, char: String, length: Int): String = { if (StringUtils.isNotBlank(str) && StringUtils.isNotBlank(char) && length > str.length) { val sb: StringBuilder = new StringBuilder(str) var i: Int = 0 while (i < length - str.length) { sb.append(char) i += 1 } sb.toString } else if (length == str.length) { str } else if (length < str.length && length > 0) { str.substring(0, length) } else { "" } } } ================================================ FILE: fire-connectors/base-connectors/fire-jdbc/pom.xml ================================================ 4.0.0 fire-connector-jdbc_${scala.binary.version} jar Fire : Connectors : Common : JDBC com.zto.fire fire-connectors-common 2.3.2-SNAPSHOT ../pom.xml mysql mysql-connector-java ${mysql.version} c3p0 c3p0 0.9.1.2 org.apache.derby derby 10.13.1.1 test org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-connectors/base-connectors/fire-jdbc/src/main/resources/driver.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# mysql = com.mysql.jdbc.Driver sqlserver = com.microsoft.sqlserver.jdbc.SQLServerDriver oracle = oracle.jdbc.driver.OracleDriver hive = org.apache.hive.jdbc.HiveDriver presto = com.facebook.presto.jdbc.PrestoDriver spark = org.apache.hive.jdbc.HiveDriver clickhouse = ru.yandex.clickhouse.ClickHouseDriver postgresql = org.postgresql.Driver impala = com.cloudera.impala.jdbc41.Driver derby = org.apache.derby.jdbc.EmbeddedDriver kylin = org.apache.kylin.jdbc.Driver ================================================ FILE: fire-connectors/base-connectors/fire-jdbc/src/main/scala/com/zto/fire/jdbc/JdbcConnector.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.jdbc import java.sql.{Connection, PreparedStatement, ResultSet, SQLException, Statement} import com.mchange.v2.c3p0.ComboPooledDataSource import com.zto.fire.common.enu.{Operation => FOperation} import com.zto.fire.common.anno.Internal import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.util.{LineageManager, LogUtils, ReflectionUtils, StringsUtils} import com.zto.fire.core.connector.{ConnectorFactory, FireConnector} import com.zto.fire.jdbc.conf.FireJdbcConf import com.zto.fire.jdbc.util.DBUtils import com.zto.fire.predef._ import java.lang.reflect.Method import scala.collection.mutable import scala.reflect.ClassTag /** * 数据库连接池(c3p0)工具类 * 封装了数据库常用的操作方法 * * @param conf * 代码级别的配置信息,允许为空,配置文件会覆盖相同配置项,也就是说配置文件拥有着跟高的优先级 * @param keyNum * 用于区分连接不同的数据源,不同配置源对应不同的Connector实例 * @author ChengLong 2020-11-27 10:31:03 */ class JdbcConnector(conf: JdbcConf = null, keyNum: Int = 1) extends FireConnector(keyNum = keyNum) { private[this] var connPool: ComboPooledDataSource = _ // 日志中sql截取的长度 private lazy val logSqlLength = FireFrameworkConf.logSqlLength private[this] var poolMethodMap: mutable.Map[JString, Method] = _ private[this] var username: String = _ private[this] var url: String = _ private[this] var dbType: String = "unknown" private[this] lazy val finallyCatchLog = "释放jdbc资源失败" /** * c3p0线程池初始化 */ override protected[fire] def open(): Unit = { tryWithLog { // 从配置文件中读取配置信息,并设置到ComboPooledDataSource对象中 this.logger.info(s"准备初始化数据库连接池[ ${FireJdbcConf.jdbcUrl(keyNum)} ]") // 支持url和别名两种配置方式 this.url = if (isEmpty(FireJdbcConf.jdbcUrl(keyNum)) && noEmpty(this.conf, this.conf.url)) this.conf.url else FireJdbcConf.jdbcUrl(keyNum) require(noEmpty(this.url), s"数据库url不能为空,keyNum=${this.keyNum}") val driverClass = if (isEmpty(FireJdbcConf.driverClass(keyNum)) && noEmpty(this.conf) && noEmpty(this.conf.driverClass)) this.conf.driverClass else FireJdbcConf.driverClass(keyNum) val autoDriver = if (isEmpty(driverClass)) DBUtils.parseDriverByUrl(this.url) else driverClass require(noEmpty(autoDriver), s"数据库driverClass不能为空,keyNum=${this.keyNum}") 
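// Config-file values take precedence here as well: the code-level JdbcConf passed to the constructor is only consulted for username/password when the corresponding db.jdbc.user / db.jdbc.password values are empty for this keyNum.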
this.username = if (isEmpty(FireJdbcConf.user(keyNum)) && noEmpty(this.conf, this.conf.username)) this.conf.username else FireJdbcConf.user(keyNum) val password = if (isEmpty(FireJdbcConf.password(keyNum)) && noEmpty(this.conf, this.conf.password)) this.conf.password else FireJdbcConf.password(keyNum) // 识别数据源类型是oracle、mysql等 this.dbType = DBUtils.dbTypeParser(autoDriver, this.url) logger.info(s"Fire框架识别到当前jdbc数据源标识为:${this.dbType},keyNum=${this.keyNum}") // 创建c3p0数据库连接池实例 val pool = new ComboPooledDataSource(true) pool.setJdbcUrl(this.url) pool.setDriverClass(autoDriver) if (noEmpty(this.username)) pool.setUser(this.username) if (noEmpty(password)) pool.setPassword(password) pool.setMaxPoolSize(FireJdbcConf.maxPoolSize(keyNum)) pool.setMinPoolSize(FireJdbcConf.minPoolSize(keyNum)) pool.setAcquireIncrement(FireJdbcConf.acquireIncrement(keyNum)) pool.setInitialPoolSize(FireJdbcConf.initialPoolSize(keyNum)) pool.setMaxStatements(0) pool.setMaxStatementsPerConnection(0) pool.setMaxIdleTime(FireJdbcConf.maxIdleTime(keyNum)) // 加载以db.c3p0.conf.为前缀的配置项 this.installDBPoolProperties(pool, this.keyNum) this.connPool = pool this.logger.info(s"创建数据库连接池[ $keyNum ] driver: ${this.dbType}") }(this.logger, s"数据库连接池创建成功", s"初始化数据库连接池[ $keyNum ]失败") } /** * 设置数据库连接池相关的参数 * * @param pool * 连接池实例 * @param keyNum * 配置的数字后缀 */ @Internal private[this] def installDBPoolProperties(pool: ComboPooledDataSource, keyNum: Int): Unit = { if (noEmpty(pool, keyNum)) { try { // 获取以db.c3p0.conf.开头以keyNum结尾的所有配置项 val confMap = FireJdbcConf.c3p0ConfMap(keyNum) // 获取pool所有的防范 if (isEmpty(this.poolMethodMap)) this.poolMethodMap = ReflectionUtils.getAllMethods(classOf[ComboPooledDataSource]).map(t => (t._1.toUpperCase, t._2)) LogUtils.logMap(this.logger, confMap, s"c3p0 configuration. keyNum=$keyNum.") // 匹配配置文件中指定的c3p0参数 confMap.foreach(prop => { val upperConf = s"set${prop._1}".toUpperCase if (noEmpty(prop._2) && this.poolMethodMap.containsKey(upperConf)) { val method = this.poolMethodMap(upperConf) // 获取pool对象中所有set方法的参数类型,如:setMaxPoolSize( int maxPoolSize ) method.getParameterTypes.map(t => t.getName).foreach { // 根据方法参数的类型将参数的值转为对应的类型 case "int" => method.invoke(pool, new JInt(prop._2)) case "boolean" => method.invoke(pool, new JBoolean(prop._2)) case "java.lang.String" => method.invoke(pool, prop._2) case _ => this.logger.error(s"暂不支持的c3p0配置参数类型:${upperConf} 当前仅支持int、boolean、String") } } else { this.logger.warn(s"数据库连接池不支持的配置:${FireJdbcConf.JDBC_C3P0_CONF_PREFIX + prop._1}=${prop._2},请核实!") } }) } catch { case exception: Exception => this.logger.error("设置c3p0参数过程中出现异常,请检查以db.c3p0.conf.开头的配置项!", exception) } } } /** * 关闭c3p0数据库连接池 */ override protected def close(): Unit = { if (this.connPool != null) { this.connPool.close() logger.debug(s"释放jdbc 连接池成功. 
keyNum=$keyNum") } } /** * 从指定的连接池中获取一个连接 * * @return * 对应配置项的数据库连接 */ def getConnection: Connection = { tryWithReturn { val connection = this.connPool.getConnection this.logger.debug(s"获取数据库连接[ ${keyNum} ]成功") connection }(this.logger, catchLog = s"获取数据库连接[ ${FireJdbcConf.jdbcUrl(keyNum)} ]发生异常,请检查配置文件") } /** * 更新操作 * * @param sql * 待执行的sql语句 * @param params * sql中的参数 * @param connection * 传递已有的数据库连接,可满足跨api的同一事务提交的需求 * @param commit * 是否自动提交事务,默认为自动提交 * @param closeConnection * 是否关闭connection,默认关闭 * @return * 影响的记录数 */ def executeUpdate(sql: String, params: Seq[Any] = null, connection: Connection = null, commit: Boolean = true, closeConnection: Boolean = true): Long = { val conn = if (connection == null) this.getConnection else connection var retVal: Long = 0L var stat: PreparedStatement = null tryFinallyWithReturn { conn.setAutoCommit(false) stat = conn.prepareStatement(sql) // 设置值参数 if (params != null && params.nonEmpty) { var i: Int = 1 params.foreach(param => { stat.setObject(i, param) i += 1 }) } retVal = stat.executeUpdate if (commit) conn.commit() this.logger.info(s"executeUpdate success. keyNum: ${keyNum} count: $retVal") retVal } { this.release(sql, conn, stat, null, closeConnection) }(this.logger, s"${this.sqlBuriedPoint(sql, FOperation.UPDATE)}", s"executeUpdate failed. keyNum:${keyNum}\n${this.sqlBuriedPoint(sql, FOperation.UPDATE)}", finallyCatchLog) } /** * 执行批量更新操作 * * @param sql * 待执行的sql语句 * @param paramsList * sql的参数列表 * @param connection * 传递已有的数据库连接,可满足跨api的同一事务提交的需求 * @param commit * 是否自动提交事务,默认为自动提交 * @param closeConnection * 是否关闭connection,默认关闭 * @return * 影响的记录数 */ def executeBatch(sql: String, paramsList: Seq[Seq[Any]] = null, connection: Connection = null, commit: Boolean = true, closeConnection: Boolean = true): Array[Int] = { val conn = if (connection == null) this.getConnection else connection var stat: PreparedStatement = null var batch = 0 var count = 0 tryFinallyWithReturn { conn.setAutoCommit(false) stat = conn.prepareStatement(sql) if (paramsList != null && paramsList.nonEmpty) { paramsList.foreach(params => { var i = 1 params.foreach(param => { stat.setObject(i, param) i += 1 }) batch += 1 stat.addBatch() if (batch % FireJdbcConf.batchSize(keyNum) == 0) { stat.executeBatch() stat.clearBatch() } }) } // 执行批量更新 val retVal = stat.executeBatch if (commit) conn.commit() count = retVal.sum this.logger.info(s"executeBatch success. keyNum: ${keyNum} count: $count") retVal } { this.release(sql, conn, stat, null, closeConnection) }(this.logger, s"${this.sqlBuriedPoint(sql, FOperation.UPDATE)}", s"executeBatch failed. 
keyNum:${keyNum}\n${this.sqlBuriedPoint(sql, FOperation.UPDATE)}", finallyCatchLog) } /** * 执行查询操作,以JavaBean方式返回结果集 * * @param sql * 查询语句 * @param params * sql执行参数 * @param clazz * JavaBean类型 */ def executeQueryList[T <: Object : ClassTag](sql: String, params: Seq[Any] = null, clazz: Class[T]): List[T] = { this.executeQuery[List[T]](sql, params, rs => { DBUtils.resultSet2BeanList(rs, clazz).toList }) } /** * 执行查询操作 * * @param sql * 查询语句 * @param params * sql执行参数 * @param callback * 查询回调 */ def executeQuery[T](sql: String, params: Seq[Any] = null, callback: ResultSet => T): T = { val conn = this.getConnection var stat: PreparedStatement = null var rs: ResultSet = null tryFinallyWithReturn { stat = conn.prepareStatement(sql, ResultSet.TYPE_SCROLL_INSENSITIVE, ResultSet.CONCUR_READ_ONLY) if (params != null && params.nonEmpty) { var i = 1 params.foreach(param => { stat.setObject(i, param) i += 1 }) } rs = stat.executeQuery this.logger.info(s"executeQuery success. keyNum: ${keyNum} count: ${DBUtils.rowCount(rs)}") callback(rs) } { this.release(sql, conn, stat, rs) }(this.logger, s"${this.sqlBuriedPoint(sql, FOperation.UPDATE)}", s"executeQuery failed. keyNum:${keyNum}\n${this.sqlBuriedPoint(sql, FOperation.SELECT)}", finallyCatchLog) } /** * 释放jdbc资源的工具类 * * @param sql * 对应的sql语句 * @param conn * 数据库连接 * @param rs * 查询结果集 * @param stat * jdbc statement */ def release(sql: String, conn: Connection, stat: Statement, rs: ResultSet, closeConnection: Boolean = true): Unit = { try { if (rs != null) rs.close() } catch { case e: SQLException => { this.logger.error(s"close jdbc ResultSet failed. keyNum: ${keyNum}", e) throw e } } finally { try { if (stat != null) stat.close() } catch { case e: SQLException => { this.logger.error(s"close jdbc statement failed. keyNum: ${keyNum}", e) throw e } } finally { try { if (conn != null && closeConnection) conn.close() } catch { case e: SQLException => { this.logger.error(s"close jdbc connection failed. keyNum: ${keyNum}", e) throw e } } } } } /** * 工具方法,截取给定的SQL语句 */ @Internal private[this] def sqlBuriedPoint(sql: String, operation: FOperation): String = { try { LineageManager.addDBSql(this.dbType, this.url, this.username, sql, operation) StringsUtils.substring(sql, 0, this.logSqlLength) } catch { case _: Throwable => "" } } } /** * jdbc最基本的配置信息,如果配置文件中有,则会覆盖代码中的配置 * * @param url * 数据库的url * @param driverClass * jdbc驱动名称 * @param username * 数据库用户名 * @param password * 数据库密码 */ case class JdbcConf(url: String, driverClass: String, username: String, password: String) /** * 用于单例构建伴生类JdbcConnector的实例对象 * 每个JdbcConnector实例使用keyNum作为标识,并且与每个关系型数据库一一对应 */ object JdbcConnector extends ConnectorFactory[JdbcConnector] with JdbcFunctions { /** * 约定创建connector子类实例的方法 */ override protected def create(conf: Any = null, keyNum: Int = 1): JdbcConnector = { requireNonEmpty(keyNum) val connector = new JdbcConnector(conf.asInstanceOf[JdbcConf], keyNum) logger.debug(s"创建JdbcConnector实例成功. keyNum=$keyNum") connector } } ================================================ FILE: fire-connectors/base-connectors/fire-jdbc/src/main/scala/com/zto/fire/jdbc/JdbcConnectorBridge.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.jdbc import java.sql.{Connection, ResultSet} import scala.reflect.ClassTag /** * jdbc操作简单封装 * * @author ChengLong * @since 1.1.0 * @create 2020-05-22 15:55 */ private[fire] trait JdbcConnectorBridge { /** * 关系型数据库插入、删除、更新操作 * * @param sql * 待执行的sql语句 * @param params * sql中的参数 * @param connection * 传递已有的数据库连接 * @param commit * 是否自动提交事务,默认为自动提交 * @param closeConnection * 是否关闭connection,默认关闭 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return * 影响的记录数 */ def jdbcUpdate(sql: String, params: Seq[Any] = null, connection: Connection = null, commit: Boolean = true, closeConnection: Boolean = true, keyNum: Int = 1): Long = { JdbcConnector.executeUpdate(sql, params, connection, commit, closeConnection, keyNum) } /** * 关系型数据库批量插入、删除、更新操作 * * @param sql * 待执行的sql语句 * @param paramsList * sql的参数列表 * @param connection * 传递已有的数据库连接 * @param commit * 是否自动提交事务,默认为自动提交 * @param closeConnection * 是否关闭connection,默认关闭 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return * 影响的记录数 */ def jdbcBatchUpdate(sql: String, paramsList: Seq[Seq[Any]] = null, connection: Connection = null, commit: Boolean = true, closeConnection: Boolean = true, keyNum: Int = 1): Array[Int] = { JdbcConnector.executeBatch(sql, paramsList, connection, commit, closeConnection, keyNum) } /** * 执行查询操作,以JavaBean方式返回结果集 * * @param sql * 查询语句 * @param params * sql执行参数 * @param clazz * JavaBean类型 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return * 查询结果集 */ def jdbcQueryList[T <: Object : ClassTag](sql: String, params: Seq[Any] = null, clazz: Class[T], keyNum: Int = 1): List[T] = { JdbcConnector.executeQueryList[T](sql, params, clazz, keyNum) } /** * 执行查询操作,并在QueryCallback对结果集进行处理 * * @param sql * 查询语句 * @param params * sql执行参数 * @param callback * 查询回调 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 */ def jdbcQuery[T](sql: String, params: Seq[Any] = null, callback: ResultSet => T, keyNum: Int = 1): T = { JdbcConnector.executeQuery(sql, params, callback, keyNum) } } ================================================ FILE: fire-connectors/base-connectors/fire-jdbc/src/main/scala/com/zto/fire/jdbc/JdbcFunctions.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.jdbc import java.sql.{Connection, ResultSet} import scala.reflect.ClassTag /** * Jdbc api集合 * * @author ChengLong * @since 2.0.0 * @create 2020-12-23 15:49 */ private[fire] trait JdbcFunctions { /** * 根据指定的keyNum获取对应的数据库连接 */ def getConnection(keyNum: Int = 1): Connection = JdbcConnector(keyNum = keyNum).getConnection /** * 更新操作 * * @param sql * 待执行的sql语句 * @param params * sql中的参数 * @param connection * 传递已有的数据库连接,可满足跨api的同一事务提交的需求 * @param commit * 是否自动提交事务,默认为自动提交 * @param closeConnection * 是否关闭connection,默认关闭 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return * 影响的记录数 */ def executeUpdate(sql: String, params: Seq[Any] = null, connection: Connection = null, commit: Boolean = true, closeConnection: Boolean = true, keyNum: Int = 1): Long = { JdbcConnector(keyNum = keyNum).executeUpdate(sql, params, connection, commit, closeConnection) } /** * 执行批量更新操作 * * @param sql * 待执行的sql语句 * @param paramsList * sql的参数列表 * @param connection * 传递已有的数据库连接,可满足跨api的同一事务提交的需求 * @param commit * 是否自动提交事务,默认为自动提交 * @param closeConnection * 是否关闭connection,默认关闭 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return * 影响的记录数 */ def executeBatch(sql: String, paramsList: Seq[Seq[Any]] = null, connection: Connection = null, commit: Boolean = true, closeConnection: Boolean = true, keyNum: Int = 1): Array[Int] = { JdbcConnector(keyNum = keyNum).executeBatch(sql, paramsList, connection, commit, closeConnection) } /** * 执行查询操作,以JavaBean方式返回结果集 * * @param sql * 查询语句 * @param params * sql执行参数 * @param clazz * JavaBean类型 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 */ def executeQueryList[T <: Object : ClassTag](sql: String, params: Seq[Any] = null, clazz: Class[T], keyNum: Int = 1): List[T] = { JdbcConnector(keyNum = keyNum).executeQueryList(sql, params, clazz) } /** * 执行查询操作 * * @param sql * 查询语句 * @param params * sql执行参数 * @param callback * 查询回调 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 */ def executeQuery[T](sql: String, params: Seq[Any] = null, callback: ResultSet => T, keyNum: Int = 1): T = { JdbcConnector(keyNum = keyNum).executeQuery(sql, params, callback) } } ================================================ FILE: fire-connectors/base-connectors/fire-jdbc/src/main/scala/com/zto/fire/jdbc/conf/FireJdbcConf.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.jdbc.conf import com.zto.fire.common.util.PropUtils /** * 关系型数据库连接池相关配置 * * @author ChengLong * @since 1.1.0 * @create 2020-07-13 14:56 */ private[fire] object FireJdbcConf { // c3p0连接池相关配置 lazy val JDBC_URL = "db.jdbc.url" lazy val JDBC_URL_PREFIX = "db.jdbc.url.map." lazy val JDBC_DRIVER = "db.jdbc.driver" lazy val JDBC_USER = "db.jdbc.user" lazy val JDBC_PASSWORD = "db.jdbc.password" lazy val JDBC_ISOLATION_LEVEL = "db.jdbc.isolation.level" lazy val JDBC_MAX_POOL_SIZE = "db.jdbc.maxPoolSize" lazy val JDBC_MIN_POOL_SIZE = "db.jdbc.minPoolSize" lazy val JDBC_ACQUIRE_INCREMENT = "db.jdbc.acquireIncrement" lazy val JDBC_INITIAL_POOL_SIZE = "db.jdbc.initialPoolSize" lazy val JDBC_MAX_IDLE_TIME = "db.jdbc.maxIdleTime" lazy val JDBC_BATCH_SIZE = "db.jdbc.batch.size" lazy val JDBC_FLUSH_INTERVAL = "db.jdbc.flushInterval" lazy val JDBC_MAX_RETRY = "db.jdbc.max.retry" // c3p0数据库连接池相关配置 lazy val JDBC_C3P0_CONF_PREFIX = "db.c3p0.conf." // fire框架针对jdbc操作后数据集的缓存策略 lazy val FIRE_JDBC_STORAGE_LEVEL = "fire.jdbc.storage.level" // 通过JdbcConnector查询后将数据集放到多少个分区中,需根据实际的结果集做配置 lazy val FIRE_JDBC_QUERY_REPARTITION = "fire.jdbc.query.partitions" // 默认的事务隔离级别 lazy val jdbcIsolationLevel = "READ_UNCOMMITTED" // 数据库批量操作的记录数 lazy val jdbcBatchSize = 1000 // fire框架针对jdbc操作后数据集的缓存策略 lazy val jdbcStorageLevel = PropUtils.getString(this.FIRE_JDBC_STORAGE_LEVEL, "memory_and_disk_ser").toUpperCase // 通过JdbcConnector查询后将数据集放到多少个分区中,需根据实际的结果集做配置 lazy val jdbcQueryPartition = PropUtils.getInt(this.FIRE_JDBC_QUERY_REPARTITION, 10) // db.jdbc.url def url(keyNum: Int = 1): String = PropUtils.getString(this.JDBC_URL, "", keyNum) // jdbc url与别名映射 lazy val jdbcUrlMap = PropUtils.sliceKeys(this.JDBC_URL_PREFIX) // db.jdbc.driver def driverClass(keyNum: Int = 1): String = PropUtils.getString(this.JDBC_DRIVER,"", keyNum) // db.jdbc.user def user(keyNum: Int = 1): String = PropUtils.getString(this.JDBC_USER, "", keyNum = keyNum) // db.jdbc.password def password(keyNum: Int = 1): String = PropUtils.getString(this.JDBC_PASSWORD, "", keyNum = keyNum) // 事务的隔离级别:NONE, READ_COMMITTED, READ_UNCOMMITTED, REPEATABLE_READ, SERIALIZABLE,默认为READ_UNCOMMITTED def isolationLevel(keyNum: Int = 1): String = PropUtils.getString(this.JDBC_ISOLATION_LEVEL, this.jdbcIsolationLevel, keyNum) // 批量操作的记录数 def batchSize(keyNum: Int = 1): Int = PropUtils.getInt(this.JDBC_BATCH_SIZE, this.jdbcBatchSize, keyNum) // 默认多少毫秒flush一次 def jdbcFlushInterval(keyNum: Int = 1): Long = PropUtils.getLong(this.JDBC_FLUSH_INTERVAL, 1000, keyNum) // jdbc失败最大重试次数 def maxRetry(keyNum: Int = 1): Long = PropUtils.getLong(this.JDBC_MAX_RETRY, 3, keyNum) // 连接池最小连接数 def minPoolSize(keyNum: Int = 1): Int = PropUtils.getInt(this.JDBC_MIN_POOL_SIZE, 1, keyNum) // 连接池初始化连接数 def initialPoolSize(keyNum: Int = 1): Int = PropUtils.getInt(this.JDBC_INITIAL_POOL_SIZE, 1, keyNum) // 连接池最大连接数 def maxPoolSize(keyNum: Int = 1): Int = PropUtils.getInt(this.JDBC_MAX_POOL_SIZE, 5, keyNum) // 连接池每次自增连接数 def acquireIncrement(keyNum: Int = 1): Int = PropUtils.getInt(this.JDBC_ACQUIRE_INCREMENT, 1, keyNum) // 多久释放没有用到的连接 def maxIdleTime(keyNum: Int = 1): Int = 
PropUtils.getInt(this.JDBC_MAX_IDLE_TIME, 30, keyNum) // c3p0相关配置 def c3p0ConfMap(keyNum: Int = 1): collection.immutable.Map[String, String] = PropUtils.sliceKeysByNum(this.JDBC_C3P0_CONF_PREFIX, keyNum) /** * 根据给定的jdbc url别名获取对应的jdbc地址 */ def jdbcUrl(keyNum: Int = 1): String = { val url = this.url(keyNum) this.jdbcUrl(url) } /** * 根据别名获取jdbc的url */ def jdbcUrl(url: String): String = { this.jdbcUrlMap.getOrElse(url, url) } } ================================================ FILE: fire-connectors/base-connectors/fire-jdbc/src/main/scala/com/zto/fire/jdbc/util/DBUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.jdbc.util import com.google.common.collect.Maps import com.zto.fire.common.anno.FieldName import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.enu.Datasource import com.zto.fire.common.util.{Logging, ReflectionUtils} import com.zto.fire.jdbc.conf.FireJdbcConf import com.zto.fire.predef._ import org.apache.commons.lang3.StringUtils import java.sql.{ResultSet, Types} import java.util.Properties import scala.collection.mutable.ListBuffer import scala.reflect.ClassTag import scala.util.Try /** * 关系型数据库操作工具类 * * @author ChengLong 2019-6-23 11:16:18 */ object DBUtils extends Logging { private lazy val driverFile = "driver.properties" // 读取配置文件,获取jdbc url与driver的映射关系 private lazy val driverMap = { tryWithReturn { val properties = new Properties() properties.load(this.getClass.getClassLoader.getResourceAsStream(this.driverFile)) Maps.fromProperties(properties) } (this.logger, s"加载${this.driverFile}成功", s"加载${this.driverFile}失败,请确认该配置文件是否存在!") } /** * 将ResultSet结果转为JavaBean集合 * * @param rs 数据库中的查询结果集 * @param clazz 目标JavaBean类型 * @return 将ResultSet转换为JavaBean集合返回 */ def resultSet2BeanList[T](rs: ResultSet, clazz: Class[T]): ListBuffer[T] = { val list = ListBuffer[T]() val fields = clazz.getDeclaredFields try { val columnMap = this.columns(rs) while (rs.next()) { val obj = clazz.newInstance() fields.foreach(field => { ReflectionUtils.setAccessible(field) val anno = field.getAnnotation(classOf[FieldName]) if (!(anno != null && anno.disuse())) { val fieldName = if (anno != null && StringUtils.isNotBlank(anno.value())) anno.value() else field.getName if (columnMap.containsKey(fieldName)) { val fieldType = columnMap.get(fieldName) fieldType match { case Types.INTEGER | Types.SMALLINT=> field.set(obj, rs.getInt(fieldName)) case Types.VARCHAR | Types.CHAR | Types.LONGVARCHAR => field.set(obj, rs.getString(fieldName)) case Types.BIGINT => field.set(obj, rs.getLong(fieldName)) case Types.FLOAT => field.set(obj, rs.getFloat(fieldName)) case Types.DOUBLE => field.set(obj, rs.getDouble(fieldName)) case Types.DECIMAL => field.set(obj, rs.getBigDecimal(fieldName)) case Types.BOOLEAN | 
Types.TINYINT | Types.BIT => field.set(obj, rs.getBoolean(fieldName)) case Types.DATE => field.set(obj, rs.getDate(fieldName)) case Types.TIME => field.set(obj, rs.getTime(fieldName)) case Types.TIMESTAMP => field.set(obj, rs.getTimestamp(fieldName)) case _ => logger.error(s"ResultSet转换成JavaBean过程中遇到不支持的类型,字段名称:${fieldName},字段类型:${fieldType}") } } } }) list += obj } } catch { case e: Exception => logger.error("ResultSet转换成JavaBean过程中出现异常.", e) throw e } list } /** * 判断指定的结果集中是否包含指定的列名 * * @param rs * 关系型数据库查询结果集 * @param columnName * 列名 * @return * true: 存在 false:不存在 */ def containsColumn(rs: ResultSet, columnName: String): Boolean = { val start = currentTime val retVal = Try { try { rs.findColumn(columnName) } } if (retVal.isFailure) this.logger.warn(s"ResultSet结果集中未找到列名:${columnName},请保证ResultSet与JavaBean中的字段一一对应,耗时:${elapsed(start)}") rs.getMetaData retVal.isSuccess } /** * 根据查询结果集获取字段名称与类型的映射关系 * @param rs * jdbc query结果集 * @return * Map[FieldName, FieldType] */ def columns(rs: ResultSet): JHashMap[String, Int] = { val metaData = rs.getMetaData val fieldMap = new JHashMap[String, Int]() for (i <- 1 until metaData.getColumnCount) { val fieldName = metaData.getColumnName(i) val fieldType = metaData.getColumnType(i) fieldMap.put(fieldName, fieldType) } fieldMap } /** * 获取ResultSet返回的记录数 * * @param rs * 查询结果集 * @return * 结果集行数 */ def rowCount(rs: ResultSet): Int = { if (rs == null) return 0 rs.last() val count = rs.getRow rs.beforeFirst() count } /** * 获取jdbc连接信息,若调用者指定,以调用者为准,否则读取配置文件 * * @param jdbcProps * 调用者传入的jdbc配置信息 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return * jdbc配置信息 */ def getJdbcProps(jdbcProps: Properties = null, keyNum: Int = 1): Properties = { if (jdbcProps == null || jdbcProps.size() == 0) { val defaultProps = new Properties() defaultProps.setProperty("user", FireJdbcConf.user(keyNum)) defaultProps.setProperty("password", FireJdbcConf.password(keyNum)) defaultProps.setProperty("driver", FireJdbcConf.driverClass(keyNum)) defaultProps.setProperty("batchsize", FireJdbcConf.batchSize(keyNum).toString) defaultProps.setProperty("isolationLevel", FireJdbcConf.isolationLevel(keyNum).toUpperCase) defaultProps } else { jdbcProps } } /** * 根据jdbc驱动包名或数据库url区分连接的不同的数据库厂商标识 */ def dbTypeParser(driverClass: String, url: String): String = { var dbType = "unknown" Datasource.values().map(_.toString).foreach(datasource => { if (driverClass.toUpperCase.contains(datasource)) dbType = datasource }) // 尝试从url中的端口号解析,对结果进行校正,因为有些数据库使用的是mysql驱动,可以通过url中的端口号区分 if (StringUtils.isNotBlank(url)) { FireFrameworkConf.lineageDatasourceMap.foreach(kv => { if (url.contains(kv._2)) dbType = kv._1.toUpperCase }) } dbType } /** * 通过解析jdbc url,返回url对应的已知的driver class * * @param url * jdbc url * @return * driver class */ def parseDriverByUrl(url: String): String = { var driver = "" // 尝试从url中的端口号解析,对结果进行校正,因为有些数据库使用的是mysql驱动,可以通过url中的端口号区分 if (StringUtils.isNotBlank(url)) { this.driverMap.foreach(kv => { if (url.toLowerCase.contains(kv._1)) driver = kv._2 }) } driver } } ================================================ FILE: fire-connectors/base-connectors/pom.xml ================================================ 4.0.0 fire-connectors-common pom Fire : Connectors : Common : com.zto.fire fire-connectors 2.3.2-SNAPSHOT ../pom.xml fire-jdbc fire-hbase com.zto.fire fire-common_${scala.binary.version} ${fire.version} ${maven.scope} com.zto.fire fire-core_${scala.binary.version} ${fire.version} ${maven.scope} 
com.zto.fire fire-metrics_${scala.binary.version} ${fire.version} ${maven.scope} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/pom.xml ================================================ 4.0.0 fire-connector-flink-clickhouse_${flink.reference} jar Fire : Connectors : Fink : Clickhouse fire-flink-connectors com.zto.fire 2.3.2-SNAPSHOT ru.yandex.clickhouse clickhouse-jdbc ${clickhouse-jdbc.version} com.fasterxml.jackson.core jackson-core com.fasterxml.jackson.core jackson-databind org.apache.flink flink-table-api-java ${flink.version} ${maven.scope} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/ClickHouseDynamicTableFactory.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.connector.clickhouse; import org.apache.flink.configuration.ConfigOption; import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.connector.clickhouse.internal.options.ClickHouseDmlOptions; import org.apache.flink.connector.clickhouse.internal.options.ClickHouseReadOptions; import org.apache.flink.table.api.TableSchema; import org.apache.flink.table.connector.sink.DynamicTableSink; import org.apache.flink.table.connector.source.DynamicTableSource; import org.apache.flink.table.factories.DynamicTableSinkFactory; import org.apache.flink.table.factories.DynamicTableSourceFactory; import org.apache.flink.table.factories.FactoryUtil; import org.apache.flink.table.factories.FactoryUtil.TableFactoryHelper; import org.apache.flink.table.utils.TableSchemaUtils; import java.util.Arrays; import java.util.HashSet; import java.util.Properties; import java.util.Set; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfig.IDENTIFIER; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.CATALOG_IGNORE_PRIMARY_KEY; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.DATABASE_NAME; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.PASSWORD; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SCAN_PARTITION_COLUMN; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SCAN_PARTITION_LOWER_BOUND; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SCAN_PARTITION_NUM; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SCAN_PARTITION_UPPER_BOUND; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SINK_BATCH_SIZE; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SINK_FLUSH_INTERVAL; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SINK_IGNORE_DELETE; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SINK_MAX_RETRIES; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SINK_PARTITION_KEY; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SINK_PARTITION_STRATEGY; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SINK_WRITE_LOCAL; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.TABLE_NAME; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.URL; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.USERNAME; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.USE_LOCAL; import static org.apache.flink.connector.clickhouse.internal.partitioner.ClickHousePartitioner.BALANCED; import static org.apache.flink.connector.clickhouse.internal.partitioner.ClickHousePartitioner.HASH; import static org.apache.flink.connector.clickhouse.internal.partitioner.ClickHousePartitioner.SHUFFLE; import static org.apache.flink.connector.clickhouse.util.ClickHouseUtil.getClickHouseProperties; /** A {@link DynamicTableSinkFactory} for discovering {@link ClickHouseDynamicTableSink}. 
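* It also implements {@link DynamicTableSourceFactory} and creates {@link ClickHouseDynamicTableSource} instances for reads.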
*/ public class ClickHouseDynamicTableFactory implements DynamicTableSinkFactory, DynamicTableSourceFactory { public ClickHouseDynamicTableFactory() {} @Override public DynamicTableSink createDynamicTableSink(Context context) { TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context); ReadableConfig config = helper.getOptions(); helper.validate(); validateConfigOptions(config); return new ClickHouseDynamicTableSink( getDmlOptions(config), context.getCatalogTable(), context.getCatalogTable().getResolvedSchema()); } @Override public DynamicTableSource createDynamicTableSource(Context context) { TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context); ReadableConfig config = helper.getOptions(); helper.validate(); validateConfigOptions(config); Properties clickHouseProperties = getClickHouseProperties(context.getCatalogTable().getOptions()); TableSchema physicalSchema = TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema()); return new ClickHouseDynamicTableSource( getReadOptions(config), clickHouseProperties, context.getCatalogTable(), physicalSchema); } @Override public String factoryIdentifier() { return IDENTIFIER; } @Override public Set> requiredOptions() { Set> requiredOptions = new HashSet<>(); requiredOptions.add(URL); requiredOptions.add(TABLE_NAME); return requiredOptions; } @Override public Set> optionalOptions() { Set> optionalOptions = new HashSet<>(); optionalOptions.add(USERNAME); optionalOptions.add(PASSWORD); optionalOptions.add(DATABASE_NAME); optionalOptions.add(USE_LOCAL); optionalOptions.add(SINK_BATCH_SIZE); optionalOptions.add(SINK_FLUSH_INTERVAL); optionalOptions.add(SINK_MAX_RETRIES); optionalOptions.add(SINK_WRITE_LOCAL); optionalOptions.add(SINK_PARTITION_STRATEGY); optionalOptions.add(SINK_PARTITION_KEY); optionalOptions.add(SINK_IGNORE_DELETE); optionalOptions.add(CATALOG_IGNORE_PRIMARY_KEY); optionalOptions.add(SCAN_PARTITION_COLUMN); optionalOptions.add(SCAN_PARTITION_NUM); optionalOptions.add(SCAN_PARTITION_LOWER_BOUND); optionalOptions.add(SCAN_PARTITION_UPPER_BOUND); return optionalOptions; } private void validateConfigOptions(ReadableConfig config) { String partitionStrategy = config.get(SINK_PARTITION_STRATEGY); if (!Arrays.asList(HASH, BALANCED, SHUFFLE).contains(partitionStrategy)) { throw new IllegalArgumentException( String.format("Unknown sink.partition-strategy `%s`", partitionStrategy)); } else if (HASH.equals(partitionStrategy) && !config.getOptional(SINK_PARTITION_KEY).isPresent()) { throw new IllegalArgumentException( "A partition key must be provided for hash partition strategy"); } else if (config.getOptional(USERNAME).isPresent() ^ config.getOptional(PASSWORD).isPresent()) { throw new IllegalArgumentException( "Either all or none of username and password should be provided"); } else if (config.getOptional(SCAN_PARTITION_COLUMN).isPresent() ^ config.getOptional(SCAN_PARTITION_NUM).isPresent() ^ config.getOptional(SCAN_PARTITION_LOWER_BOUND).isPresent() ^ config.getOptional(SCAN_PARTITION_UPPER_BOUND).isPresent()) { throw new IllegalArgumentException( "Either all or none of partition configs should be provided"); } } private ClickHouseDmlOptions getDmlOptions(ReadableConfig config) { return new ClickHouseDmlOptions.Builder() .withUrl(config.get(URL)) .withUsername(config.get(USERNAME)) .withPassword(config.get(PASSWORD)) .withDatabaseName(config.get(DATABASE_NAME)) .withTableName(config.get(TABLE_NAME)) .withBatchSize(config.get(SINK_BATCH_SIZE)) 
.withFlushInterval(config.get(SINK_FLUSH_INTERVAL)) .withMaxRetries(config.get(SINK_MAX_RETRIES)) .withWriteLocal(config.get(SINK_WRITE_LOCAL)) .withUseLocal(config.get(USE_LOCAL)) .withPartitionStrategy(config.get(SINK_PARTITION_STRATEGY)) .withPartitionKey(config.get(SINK_PARTITION_KEY)) .withIgnoreDelete(config.get(SINK_IGNORE_DELETE)) .build(); } private ClickHouseReadOptions getReadOptions(ReadableConfig config) { return new ClickHouseReadOptions.Builder() .withUrl(config.get(URL)) .withUsername(config.get(USERNAME)) .withPassword(config.get(PASSWORD)) .withDatabaseName(config.get(DATABASE_NAME)) .withTableName(config.get(TABLE_NAME)) .withUseLocal(config.get(USE_LOCAL)) .withPartitionColumn(config.get(SCAN_PARTITION_COLUMN)) .withPartitionNum(config.get(SCAN_PARTITION_NUM)) .withPartitionLowerBound(config.get(SCAN_PARTITION_LOWER_BOUND)) .withPartitionUpperBound(config.get(SCAN_PARTITION_UPPER_BOUND)) .build(); } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/ClickHouseDynamicTableSink.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse; import org.apache.flink.connector.clickhouse.internal.AbstractClickHouseOutputFormat; import org.apache.flink.connector.clickhouse.internal.options.ClickHouseDmlOptions; import org.apache.flink.table.catalog.CatalogTable; import org.apache.flink.table.catalog.Column; import org.apache.flink.table.catalog.ResolvedSchema; import org.apache.flink.table.connector.ChangelogMode; import org.apache.flink.table.connector.sink.DynamicTableSink; import org.apache.flink.table.connector.sink.OutputFormatProvider; import org.apache.flink.table.connector.sink.abilities.SupportsPartitioning; import org.apache.flink.table.types.DataType; import org.apache.flink.types.RowKind; import org.apache.flink.util.Preconditions; import java.util.LinkedHashMap; import java.util.Map; /** * A {@link DynamicTableSink} that describes how to create a {@link ClickHouseDynamicTableSink} from * a logical description. * *
TODO: Partitioning strategy isn't well implemented. */ public class ClickHouseDynamicTableSink implements DynamicTableSink, SupportsPartitioning { private final CatalogTable catalogTable; private final ResolvedSchema tableSchema; private final ClickHouseDmlOptions options; private boolean dynamicGrouping = false; private LinkedHashMap staticPartitionSpec = new LinkedHashMap<>(); public ClickHouseDynamicTableSink( ClickHouseDmlOptions options, CatalogTable catalogTable, ResolvedSchema tableSchema) { this.options = options; this.catalogTable = catalogTable; this.tableSchema = tableSchema; } @Override public ChangelogMode getChangelogMode(ChangelogMode requestedMode) { validatePrimaryKey(requestedMode); return ChangelogMode.newBuilder() .addContainedKind(RowKind.INSERT) .addContainedKind(RowKind.UPDATE_AFTER) .addContainedKind(RowKind.DELETE) .build(); } private void validatePrimaryKey(ChangelogMode requestedMode) { Preconditions.checkState( ChangelogMode.insertOnly().equals(requestedMode) || tableSchema.getPrimaryKey().isPresent(), "Please declare primary key for sink table when query contains update/delete record."); } @Override public SinkRuntimeProvider getSinkRuntimeProvider(Context context) { String[] fieldNames = tableSchema.getColumns().stream() .filter(Column::isPhysical) .map(Column::getName) .toArray(String[]::new); DataType[] fieldTypes = tableSchema.getColumns().stream() .filter(Column::isPhysical) .map(Column::getDataType) .toArray(DataType[]::new); AbstractClickHouseOutputFormat outputFormat = new AbstractClickHouseOutputFormat.Builder() .withOptions(options) .withFieldNames(fieldNames) .withFieldDataTypes(fieldTypes) .withPrimaryKey(tableSchema.getPrimaryKey().orElse(null)) .withPartitionKey(catalogTable.getPartitionKeys()) .build(); return OutputFormatProvider.of(outputFormat); } @Override public void applyStaticPartition(Map partition) { staticPartitionSpec = new LinkedHashMap<>(); for (String partitionCol : catalogTable.getPartitionKeys()) { if (partition.containsKey(partitionCol)) { staticPartitionSpec.put(partitionCol, partition.get(partitionCol)); } } } @Override public boolean requiresPartitionGrouping(boolean supportsGrouping) { this.dynamicGrouping = supportsGrouping; return supportsGrouping; } @Override public DynamicTableSink copy() { ClickHouseDynamicTableSink sink = new ClickHouseDynamicTableSink(options, catalogTable, tableSchema); sink.dynamicGrouping = dynamicGrouping; sink.staticPartitionSpec = staticPartitionSpec; return sink; } @Override public String asSummaryString() { return "ClickHouse table sink"; } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/ClickHouseDynamicTableSource.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse; import org.apache.flink.connector.clickhouse.internal.AbstractClickHouseInputFormat; import org.apache.flink.connector.clickhouse.internal.options.ClickHouseReadOptions; import org.apache.flink.connector.clickhouse.util.FilterPushDownHelper; import org.apache.flink.table.api.TableSchema; import org.apache.flink.table.catalog.CatalogTable; import org.apache.flink.table.connector.ChangelogMode; import org.apache.flink.table.connector.source.DynamicTableSource; import org.apache.flink.table.connector.source.InputFormatProvider; import org.apache.flink.table.connector.source.ScanTableSource; import org.apache.flink.table.connector.source.abilities.SupportsFilterPushDown; import org.apache.flink.table.connector.source.abilities.SupportsLimitPushDown; import org.apache.flink.table.connector.source.abilities.SupportsProjectionPushDown; import org.apache.flink.table.expressions.ResolvedExpression; import org.apache.flink.table.utils.TableSchemaUtils; import java.util.ArrayList; import java.util.List; import java.util.Properties; /** ClickHouse table source. */ public class ClickHouseDynamicTableSource implements ScanTableSource, SupportsProjectionPushDown, SupportsLimitPushDown, SupportsFilterPushDown { private final ClickHouseReadOptions readOptions; private final Properties connectionProperties; private final CatalogTable catalogTable; private TableSchema physicalSchema; private String filterClause; private long limit = -1L; public ClickHouseDynamicTableSource( ClickHouseReadOptions readOptions, Properties properties, CatalogTable catalogTable, TableSchema physicalSchema) { this.readOptions = readOptions; this.connectionProperties = properties; this.catalogTable = catalogTable; this.physicalSchema = physicalSchema; } @Override public ChangelogMode getChangelogMode() { return ChangelogMode.insertOnly(); } @Override public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { AbstractClickHouseInputFormat.Builder builder = new AbstractClickHouseInputFormat.Builder() .withOptions(readOptions) .withConnectionProperties(connectionProperties) .withFieldNames(physicalSchema.getFieldNames()) .withFieldTypes(physicalSchema.getFieldDataTypes()) .withRowDataTypeInfo( runtimeProviderContext.createTypeInformation( physicalSchema.toRowDataType())) .withFilterClause(filterClause) .withLimit(limit); return InputFormatProvider.of(builder.build()); } @Override public DynamicTableSource copy() { ClickHouseDynamicTableSource source = new ClickHouseDynamicTableSource( readOptions, connectionProperties, catalogTable, physicalSchema); source.filterClause = filterClause; source.limit = limit; return source; } @Override public String asSummaryString() { return "ClickHouse table source"; } @Override public Result applyFilters(List filters) { this.filterClause = FilterPushDownHelper.convert(filters); return Result.of(new ArrayList<>(filters), new ArrayList<>(filters)); } @Override public void applyLimit(long limit) { this.limit = limit; } @Override public boolean supportsNestedProjection() { return false; } @Override public void applyProjection(int[][] projectedFields) { this.physicalSchema = TableSchemaUtils.projectSchema(physicalSchema, projectedFields); } } ================================================ FILE: 
fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/catalog/ClickHouseCatalog.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.catalog; import org.apache.flink.connector.clickhouse.ClickHouseDynamicTableFactory; import org.apache.flink.connector.clickhouse.internal.common.DistributedEngineFullSchema; import org.apache.flink.connector.clickhouse.util.ClickHouseTypeUtil; import org.apache.flink.connector.clickhouse.util.ClickHouseUtil; import org.apache.flink.table.api.TableSchema; import org.apache.flink.table.catalog.AbstractCatalog; import org.apache.flink.table.catalog.CatalogBaseTable; import org.apache.flink.table.catalog.CatalogDatabase; import org.apache.flink.table.catalog.CatalogDatabaseImpl; import org.apache.flink.table.catalog.CatalogFunction; import org.apache.flink.table.catalog.CatalogPartition; import org.apache.flink.table.catalog.CatalogPartitionSpec; import org.apache.flink.table.catalog.CatalogTableImpl; import org.apache.flink.table.catalog.ObjectPath; import org.apache.flink.table.catalog.exceptions.CatalogException; import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; import org.apache.flink.table.catalog.exceptions.FunctionAlreadyExistException; import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; import org.apache.flink.table.catalog.exceptions.PartitionAlreadyExistsException; import org.apache.flink.table.catalog.exceptions.PartitionNotExistException; import org.apache.flink.table.catalog.exceptions.PartitionSpecInvalidException; import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; import org.apache.flink.table.catalog.exceptions.TableNotExistException; import org.apache.flink.table.catalog.exceptions.TableNotPartitionedException; import org.apache.flink.table.catalog.exceptions.TablePartitionedException; import org.apache.flink.table.catalog.stats.CatalogColumnStatistics; import org.apache.flink.table.catalog.stats.CatalogTableStatistics; import org.apache.flink.table.expressions.Expression; import org.apache.flink.table.factories.Factory; import org.apache.flink.table.types.DataType; import org.apache.flink.util.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import ru.yandex.clickhouse.BalancedClickhouseDataSource; import ru.yandex.clickhouse.ClickHouseConnection; import ru.yandex.clickhouse.response.ClickHouseColumnInfo; import ru.yandex.clickhouse.response.ClickHouseResultSetMetaData; import 
ru.yandex.clickhouse.settings.ClickHouseQueryParam; import javax.annotation.Nullable; import java.lang.reflect.Method; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Properties; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfig.CATALOG_IGNORE_PRIMARY_KEY; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfig.DATABASE_NAME; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfig.PASSWORD; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfig.TABLE_NAME; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfig.URL; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfig.USERNAME; import static org.apache.flink.util.Preconditions.checkArgument; /** ClickHouse catalog. */ public class ClickHouseCatalog extends AbstractCatalog { private static final Logger LOG = LoggerFactory.getLogger(ClickHouseCatalog.class); public static final String DEFAULT_DATABASE = "default"; private final String baseUrl; private final String username; private final String password; private final boolean ignorePrimaryKey; private final Map properties; private ClickHouseConnection connection; public ClickHouseCatalog(String catalogName, Map properties) { this( catalogName, properties.get(DATABASE_NAME), properties.get(URL), properties.get(USERNAME), properties.get(PASSWORD), properties); } public ClickHouseCatalog( String catalogName, @Nullable String defaultDatabase, String baseUrl, String username, String password) { this(catalogName, defaultDatabase, baseUrl, username, password, Collections.emptyMap()); } public ClickHouseCatalog( String catalogName, @Nullable String defaultDatabase, String baseUrl, String username, String password, Map properties) { super(catalogName, defaultDatabase == null ? DEFAULT_DATABASE : defaultDatabase); checkArgument( !StringUtils.isNullOrWhitespaceOnly(baseUrl), "baseUrl cannot be null or empty"); checkArgument( !StringUtils.isNullOrWhitespaceOnly(username), "username cannot be null or empty"); checkArgument( !StringUtils.isNullOrWhitespaceOnly(password), "password cannot be null or empty"); this.baseUrl = baseUrl.endsWith("/") ? 
baseUrl : baseUrl + "/"; this.username = username; this.password = password; this.ignorePrimaryKey = properties.get(CATALOG_IGNORE_PRIMARY_KEY) == null || Boolean.parseBoolean(properties.get(CATALOG_IGNORE_PRIMARY_KEY)); this.properties = Collections.unmodifiableMap(properties); } @Override public void open() throws CatalogException { try { Properties configuration = new Properties(); configuration.putAll(properties); configuration.setProperty(ClickHouseQueryParam.USER.getKey(), username); configuration.setProperty(ClickHouseQueryParam.PASSWORD.getKey(), password); String jdbcUrl = ClickHouseUtil.getJdbcUrl(baseUrl, getDefaultDatabase()); BalancedClickhouseDataSource dataSource = new BalancedClickhouseDataSource(jdbcUrl, configuration); dataSource.actualize(); connection = dataSource.getConnection(); LOG.info("Created catalog {}, established connection to {}", getName(), jdbcUrl); } catch (Exception e) { throw new CatalogException(String.format("Opening catalog %s failed.", getName()), e); } } @Override public synchronized void close() throws CatalogException { try { connection.close(); LOG.info("Closed catalog {} ", getName()); } catch (Exception e) { throw new CatalogException(String.format("Closing catalog %s failed.", getName()), e); } } @Override public Optional getFactory() { return Optional.of(new ClickHouseDynamicTableFactory()); } // ------------- databases ------------- @Override public synchronized List listDatabases() throws CatalogException { // Sometimes we need to look up database `system`, so we won't get rid of it. try (PreparedStatement stmt = connection.prepareStatement("SELECT name from `system`.databases"); ResultSet rs = stmt.executeQuery()) { List databases = new ArrayList<>(); while (rs.next()) { databases.add(rs.getString(1)); } return databases; } catch (Exception e) { throw new CatalogException( String.format("Failed listing database in catalog %s", getName()), e); } } @Override public CatalogDatabase getDatabase(String databaseName) throws DatabaseNotExistException, CatalogException { if (listDatabases().contains(databaseName)) { return new CatalogDatabaseImpl(Collections.emptyMap(), null); } else { throw new DatabaseNotExistException(getName(), databaseName); } } @Override public boolean databaseExists(String databaseName) throws CatalogException { checkArgument(!StringUtils.isNullOrWhitespaceOnly(databaseName)); return listDatabases().contains(databaseName); } @Override public void createDatabase(String name, CatalogDatabase database, boolean ignoreIfExists) throws DatabaseAlreadyExistException, CatalogException { throw new UnsupportedOperationException(); } @Override public void dropDatabase(String name, boolean ignoreIfNotExists, boolean cascade) throws DatabaseNotEmptyException, CatalogException { throw new UnsupportedOperationException(); } @Override public void alterDatabase(String name, CatalogDatabase newDatabase, boolean ignoreIfNotExists) throws DatabaseNotExistException, CatalogException { throw new UnsupportedOperationException(); } // ------------- tables ------------- @Override public synchronized List listTables(String databaseName) throws DatabaseNotExistException, CatalogException { if (!databaseExists(databaseName)) { throw new DatabaseNotExistException(getName(), databaseName); } try (PreparedStatement stmt = connection.prepareStatement( String.format( "SELECT name from `system`.tables where database = '%s'", databaseName)); ResultSet rs = stmt.executeQuery()) { List tables = new ArrayList<>(); while (rs.next()) { tables.add(rs.getString(1)); 
} return tables; } catch (Exception e) { throw new CatalogException( String.format( "Failed listing tables in catalog %s database %s", getName(), databaseName), e); } } @Override public List listViews(String databaseName) throws DatabaseNotExistException, CatalogException { throw new UnsupportedOperationException(); } @Override public CatalogBaseTable getTable(ObjectPath tablePath) throws TableNotExistException, CatalogException { if (!tableExists(tablePath)) { throw new TableNotExistException(getName(), tablePath); } Map configuration = new HashMap<>(properties); configuration.put(URL, baseUrl); configuration.put(DATABASE_NAME, tablePath.getDatabaseName()); configuration.put(TABLE_NAME, tablePath.getObjectName()); configuration.put(USERNAME, username); configuration.put(PASSWORD, password); String databaseName = tablePath.getDatabaseName(); String tableName = tablePath.getObjectName(); try { DistributedEngineFullSchema engineFullSchema = ClickHouseUtil.getAndParseDistributedEngineSchema( connection, tablePath.getDatabaseName(), tablePath.getObjectName()); if (engineFullSchema != null) { databaseName = engineFullSchema.getDatabase(); tableName = engineFullSchema.getTable(); } } catch (Exception e) { throw new CatalogException( String.format( "Failed getting engine full of %s.%s.%s", getName(), databaseName, tableName), e); } return new CatalogTableImpl( createTableSchema(databaseName, tableName), getPartitionKeys(databaseName, tableName), configuration, ""); } private synchronized TableSchema createTableSchema(String databaseName, String tableName) { // 1.Maybe has compatibility problems with the different version of clickhouse jdbc. 2. Is // it more appropriate to use type literals from `system.columns` to convert Flink data // types? 3. All queried data will be obtained before PreparedStatement is closed, so we // must add `limit 0` statement to avoid data transmission to the client, look at // `ChunkedInputStream.close()` for more info. 
try (PreparedStatement stmt = connection.prepareStatement( String.format( "SELECT * from `%s`.`%s` limit 0", databaseName, tableName))) { ClickHouseResultSetMetaData metaData = stmt.getMetaData().unwrap(ClickHouseResultSetMetaData.class); Method getColMethod = metaData.getClass().getDeclaredMethod("getCol", int.class); getColMethod.setAccessible(true); List primaryKeys = getPrimaryKeys(databaseName, tableName); TableSchema.Builder builder = TableSchema.builder(); for (int idx = 1; idx <= metaData.getColumnCount(); idx++) { ClickHouseColumnInfo columnInfo = (ClickHouseColumnInfo) getColMethod.invoke(metaData, idx); String columnName = columnInfo.getColumnName(); DataType columnType = ClickHouseTypeUtil.toFlinkType(columnInfo); if (primaryKeys.contains(columnName)) { columnType = columnType.notNull(); } builder.field(columnName, columnType); } if (!primaryKeys.isEmpty()) { builder.primaryKey(primaryKeys.toArray(new String[0])); } return builder.build(); } catch (Exception e) { throw new CatalogException( String.format( "Failed getting columns in catalog %s database %s table %s", getName(), databaseName, tableName), e); } } private List getPrimaryKeys(String databaseName, String tableName) { if (ignorePrimaryKey) { return Collections.emptyList(); } try (PreparedStatement stmt = connection.prepareStatement( String.format( "SELECT name from `system`.columns where `database` = '%s' and `table` = '%s' and is_in_primary_key = 1", databaseName, tableName)); ResultSet rs = stmt.executeQuery()) { List primaryKeys = new ArrayList<>(); while (rs.next()) { primaryKeys.add(rs.getString(1)); } return primaryKeys; } catch (Exception e) { throw new CatalogException( String.format( "Failed getting primary keys in catalog %s database %s table %s", getName(), databaseName, tableName), e); } } private List getPartitionKeys(String databaseName, String tableName) { try (PreparedStatement stmt = connection.prepareStatement( String.format( "SELECT name from `system`.columns where `database` = '%s' and `table` = '%s' and is_in_partition_key = 1", databaseName, tableName)); ResultSet rs = stmt.executeQuery()) { List partitionKeys = new ArrayList<>(); while (rs.next()) { partitionKeys.add(rs.getString(1)); } return partitionKeys; } catch (Exception e) { throw new CatalogException( String.format( "Failed getting partition keys of %s.%s.%s", getName(), databaseName, tableName), e); } } @Override public boolean tableExists(ObjectPath tablePath) throws CatalogException { try { return databaseExists(tablePath.getDatabaseName()) && listTables(tablePath.getDatabaseName()).contains(tablePath.getObjectName()); } catch (DatabaseNotExistException e) { return false; } } @Override public void dropTable(ObjectPath tablePath, boolean ignoreIfNotExists) throws TableNotExistException, CatalogException { throw new UnsupportedOperationException(); } @Override public void renameTable(ObjectPath tablePath, String newTableName, boolean ignoreIfNotExists) throws TableNotExistException, TableAlreadyExistException, CatalogException { throw new UnsupportedOperationException(); } @Override public void createTable(ObjectPath tablePath, CatalogBaseTable table, boolean ignoreIfExists) throws TableAlreadyExistException, DatabaseNotExistException, CatalogException { throw new UnsupportedOperationException(); } @Override public void alterTable( ObjectPath tablePath, CatalogBaseTable newTable, boolean ignoreIfNotExists) throws TableNotExistException, CatalogException { throw new UnsupportedOperationException(); } // ------------- partitions 
------------- @Override public List listPartitions(ObjectPath tablePath) throws TableNotExistException, TableNotPartitionedException, CatalogException { return Collections.emptyList(); } @Override public List listPartitions( ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws TableNotExistException, TableNotPartitionedException, PartitionSpecInvalidException, CatalogException { return Collections.emptyList(); } @Override public List listPartitionsByFilter( ObjectPath tablePath, List filters) throws TableNotExistException, TableNotPartitionedException, CatalogException { return Collections.emptyList(); } @Override public CatalogPartition getPartition(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws PartitionNotExistException, CatalogException { throw new PartitionNotExistException(getName(), tablePath, partitionSpec); } @Override public boolean partitionExists(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws CatalogException { throw new UnsupportedOperationException(); } @Override public void createPartition( ObjectPath tablePath, CatalogPartitionSpec partitionSpec, CatalogPartition partition, boolean ignoreIfExists) throws TableNotExistException, TableNotPartitionedException, PartitionSpecInvalidException, PartitionAlreadyExistsException, CatalogException { throw new UnsupportedOperationException(); } @Override public void dropPartition( ObjectPath tablePath, CatalogPartitionSpec partitionSpec, boolean ignoreIfNotExists) throws PartitionNotExistException, CatalogException { throw new UnsupportedOperationException(); } @Override public void alterPartition( ObjectPath tablePath, CatalogPartitionSpec partitionSpec, CatalogPartition newPartition, boolean ignoreIfNotExists) throws PartitionNotExistException, CatalogException { throw new UnsupportedOperationException(); } // ------------- functions ------------- @Override public List listFunctions(String dbName) throws DatabaseNotExistException, CatalogException { return Collections.emptyList(); } @Override public CatalogFunction getFunction(ObjectPath functionPath) throws FunctionNotExistException, CatalogException { throw new FunctionNotExistException(getName(), functionPath); } @Override public boolean functionExists(ObjectPath functionPath) throws CatalogException { return false; } @Override public void createFunction( ObjectPath functionPath, CatalogFunction function, boolean ignoreIfExists) throws FunctionAlreadyExistException, DatabaseNotExistException, CatalogException { throw new UnsupportedOperationException(); } @Override public void alterFunction( ObjectPath functionPath, CatalogFunction newFunction, boolean ignoreIfNotExists) throws FunctionNotExistException, CatalogException { throw new UnsupportedOperationException(); } @Override public void dropFunction(ObjectPath functionPath, boolean ignoreIfNotExists) throws FunctionNotExistException, CatalogException { throw new UnsupportedOperationException(); } // ------------- statistics ------------- @Override public CatalogTableStatistics getTableStatistics(ObjectPath tablePath) throws TableNotExistException, CatalogException { return CatalogTableStatistics.UNKNOWN; } @Override public CatalogColumnStatistics getTableColumnStatistics(ObjectPath tablePath) throws TableNotExistException, CatalogException { return CatalogColumnStatistics.UNKNOWN; } @Override public CatalogTableStatistics getPartitionStatistics( ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws PartitionNotExistException, CatalogException { return 
CatalogTableStatistics.UNKNOWN; } @Override public CatalogColumnStatistics getPartitionColumnStatistics( ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws PartitionNotExistException, CatalogException { return CatalogColumnStatistics.UNKNOWN; } @Override public void alterTableStatistics( ObjectPath tablePath, CatalogTableStatistics tableStatistics, boolean ignoreIfNotExists) throws TableNotExistException, CatalogException { throw new UnsupportedOperationException(); } @Override public void alterTableColumnStatistics( ObjectPath tablePath, CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) throws TableNotExistException, CatalogException, TablePartitionedException { throw new UnsupportedOperationException(); } @Override public void alterPartitionStatistics( ObjectPath tablePath, CatalogPartitionSpec partitionSpec, CatalogTableStatistics partitionStatistics, boolean ignoreIfNotExists) throws PartitionNotExistException, CatalogException { throw new UnsupportedOperationException(); } @Override public void alterPartitionColumnStatistics( ObjectPath tablePath, CatalogPartitionSpec partitionSpec, CatalogColumnStatistics columnStatistics, boolean ignoreIfNotExists) throws PartitionNotExistException, CatalogException { throw new UnsupportedOperationException(); } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/catalog/ClickHouseCatalogFactory.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.connector.clickhouse.catalog; import org.apache.flink.configuration.ConfigOption; import org.apache.flink.configuration.Configuration; import org.apache.flink.table.catalog.Catalog; import org.apache.flink.table.factories.CatalogFactory; import org.apache.flink.table.factories.FactoryUtil; import java.util.HashSet; import java.util.Set; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfig.IDENTIFIER; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.CATALOG_IGNORE_PRIMARY_KEY; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.DATABASE_NAME; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.PASSWORD; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SCAN_PARTITION_COLUMN; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SCAN_PARTITION_LOWER_BOUND; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SCAN_PARTITION_NUM; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SCAN_PARTITION_UPPER_BOUND; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SINK_BATCH_SIZE; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SINK_FLUSH_INTERVAL; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SINK_IGNORE_DELETE; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SINK_MAX_RETRIES; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SINK_PARTITION_KEY; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SINK_PARTITION_STRATEGY; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.URL; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.USERNAME; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.USE_LOCAL; import static org.apache.flink.table.factories.FactoryUtil.PROPERTY_VERSION; /** Factory for {@link ClickHouseCatalog}. 
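 *
 * <p>A minimal usage sketch; the connection details and the {@code tableEnv} variable below are
 * illustrative assumptions, not part of this module. Besides being discovered through this
 * factory, the catalog can be built programmatically and registered on an existing
 * {@code TableEnvironment}:
 *
 * <pre>{@code
 * Map<String, String> props = new HashMap<>();
 * props.put("url", "clickhouse://127.0.0.1:8123");    // ClickHouseConfig.URL
 * props.put("username", "default");                   // ClickHouseConfig.USERNAME
 * props.put("password", "secret");                    // ClickHouseConfig.PASSWORD
 * props.put("database-name", "default");              // ClickHouseConfig.DATABASE_NAME
 *
 * ClickHouseCatalog catalog = new ClickHouseCatalog("my_clickhouse", props);
 * tableEnv.registerCatalog("my_clickhouse", catalog);
 * tableEnv.useCatalog("my_clickhouse");
 * }</pre>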
*/ public class ClickHouseCatalogFactory implements CatalogFactory { @Override public String factoryIdentifier() { return IDENTIFIER; } @Override public Set> requiredOptions() { final Set> options = new HashSet<>(); options.add(URL); options.add(USERNAME); options.add(PASSWORD); return options; } @Override public Set> optionalOptions() { final Set> options = new HashSet<>(); options.add(PROPERTY_VERSION); options.add(DATABASE_NAME); options.add(USE_LOCAL); options.add(CATALOG_IGNORE_PRIMARY_KEY); options.add(SINK_BATCH_SIZE); options.add(SINK_FLUSH_INTERVAL); options.add(SINK_MAX_RETRIES); options.add(SINK_PARTITION_STRATEGY); options.add(SINK_PARTITION_KEY); options.add(SINK_IGNORE_DELETE); options.add(SCAN_PARTITION_COLUMN); options.add(SCAN_PARTITION_NUM); options.add(SCAN_PARTITION_LOWER_BOUND); options.add(SCAN_PARTITION_UPPER_BOUND); return options; } @Override public Catalog createCatalog(Context context) { final FactoryUtil.CatalogFactoryHelper helper = FactoryUtil.createCatalogFactoryHelper(this, context); helper.validate(); return new ClickHouseCatalog( context.getName(), helper.getOptions().get(DATABASE_NAME), helper.getOptions().get(URL), helper.getOptions().get(USERNAME), helper.getOptions().get(PASSWORD), ((Configuration) helper.getOptions()).toMap()); } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/config/ClickHouseConfig.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.config; /** clickhouse config properties. 
*/ public class ClickHouseConfig { public static final String IDENTIFIER = "clickhouse"; public static final String PROPERTIES_PREFIX = "properties."; public static final String URL = "url"; public static final String USERNAME = "username"; public static final String PASSWORD = "password"; public static final String DATABASE_NAME = "database-name"; public static final String TABLE_NAME = "table-name"; public static final String USE_LOCAL = "use-local"; public static final String SINK_BATCH_SIZE = "sink.batch-size"; public static final String SINK_FLUSH_INTERVAL = "sink.flush-interval"; public static final String SINK_MAX_RETRIES = "sink.max-retries"; @Deprecated public static final String SINK_WRITE_LOCAL = "sink.write-local"; public static final String SINK_PARTITION_STRATEGY = "sink.partition-strategy"; public static final String SINK_PARTITION_KEY = "sink.partition-key"; public static final String SINK_IGNORE_DELETE = "sink.ignore-delete"; public static final String CATALOG_IGNORE_PRIMARY_KEY = "catalog.ignore-primary-key"; public static final String SCAN_PARTITION_COLUMN = "scan.partition.column"; public static final String SCAN_PARTITION_NUM = "scan.partition.num"; public static final String SCAN_PARTITION_LOWER_BOUND = "scan.partition.lower-bound"; public static final String SCAN_PARTITION_UPPER_BOUND = "scan.partition.upper-bound"; } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/config/ClickHouseConfigOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.config; import org.apache.flink.configuration.ConfigOption; import org.apache.flink.configuration.ConfigOptions; import org.apache.flink.table.catalog.CommonCatalogOptions; import java.time.Duration; /** clickhouse config options. */ public class ClickHouseConfigOptions { public static final ConfigOption URL = ConfigOptions.key(ClickHouseConfig.URL) .stringType() .noDefaultValue() .withDescription("The ClickHouse url in format `clickhouse://:`."); public static final ConfigOption USERNAME = ConfigOptions.key(ClickHouseConfig.USERNAME) .stringType() .noDefaultValue() .withDescription("The ClickHouse username."); public static final ConfigOption PASSWORD = ConfigOptions.key(ClickHouseConfig.PASSWORD) .stringType() .noDefaultValue() .withDescription("The ClickHouse password."); public static final ConfigOption DATABASE_NAME = ConfigOptions.key(ClickHouseConfig.DATABASE_NAME) .stringType() .defaultValue("default") .withDescription("The ClickHouse database name. 
Default to `default`."); public static final ConfigOption DEFAULT_DATABASE = ConfigOptions.key(CommonCatalogOptions.DEFAULT_DATABASE_KEY) .stringType() .noDefaultValue() .withDescription("The ClickHouse default database name."); public static final ConfigOption TABLE_NAME = ConfigOptions.key(ClickHouseConfig.TABLE_NAME) .stringType() .noDefaultValue() .withDescription("The ClickHouse table name."); public static final ConfigOption USE_LOCAL = ConfigOptions.key(ClickHouseConfig.USE_LOCAL) .booleanType() .defaultValue(false) .withDescription( "Directly read/write local tables in case of distributed table engine."); public static final ConfigOption SINK_BATCH_SIZE = ConfigOptions.key(ClickHouseConfig.SINK_BATCH_SIZE) .intType() .defaultValue(1000) .withDescription( "The max flush size, over this number of records, will flush data. The default value is 1000."); public static final ConfigOption SINK_FLUSH_INTERVAL = ConfigOptions.key(ClickHouseConfig.SINK_FLUSH_INTERVAL) .durationType() .defaultValue(Duration.ofSeconds(1L)) .withDescription( "The flush interval mills, over this time, asynchronous threads will flush data. The default value is 1s."); public static final ConfigOption SINK_MAX_RETRIES = ConfigOptions.key(ClickHouseConfig.SINK_MAX_RETRIES) .intType() .defaultValue(3) .withDescription("The max retry times if writing records to database failed."); @Deprecated public static final ConfigOption SINK_WRITE_LOCAL = ConfigOptions.key(ClickHouseConfig.SINK_WRITE_LOCAL) .booleanType() .defaultValue(false) .withDescription( "Directly write to local tables in case of distributed table."); public static final ConfigOption SINK_PARTITION_STRATEGY = ConfigOptions.key(ClickHouseConfig.SINK_PARTITION_STRATEGY) .stringType() .defaultValue("balanced") .withDescription("Partition strategy, available: balanced, hash, shuffle."); public static final ConfigOption SINK_PARTITION_KEY = ConfigOptions.key(ClickHouseConfig.SINK_PARTITION_KEY) .stringType() .noDefaultValue() .withDescription("Partition key used for hash strategy."); public static final ConfigOption SINK_IGNORE_DELETE = ConfigOptions.key(ClickHouseConfig.SINK_IGNORE_DELETE) .booleanType() .defaultValue(true) .withDescription( "Whether to treat update statements as insert statements and ignore deletes. defaults to true."); public static final ConfigOption CATALOG_IGNORE_PRIMARY_KEY = ConfigOptions.key(ClickHouseConfig.CATALOG_IGNORE_PRIMARY_KEY) .booleanType() .defaultValue(true) .withDescription( "Whether to ignore primary keys when using ClickHouseCatalog to create table. 
defaults to true."); public static final ConfigOption SCAN_PARTITION_COLUMN = ConfigOptions.key(ClickHouseConfig.SCAN_PARTITION_COLUMN) .stringType() .noDefaultValue() .withDescription("The column name used for partitioning the input."); public static final ConfigOption SCAN_PARTITION_NUM = ConfigOptions.key(ClickHouseConfig.SCAN_PARTITION_NUM) .intType() .noDefaultValue() .withDescription("The number of partitions."); public static final ConfigOption SCAN_PARTITION_LOWER_BOUND = ConfigOptions.key(ClickHouseConfig.SCAN_PARTITION_LOWER_BOUND) .longType() .noDefaultValue() .withDescription("The smallest value of the first partition."); public static final ConfigOption SCAN_PARTITION_UPPER_BOUND = ConfigOptions.key(ClickHouseConfig.SCAN_PARTITION_UPPER_BOUND) .longType() .noDefaultValue() .withDescription("The largest value of the last partition."); } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/AbstractClickHouseInputFormat.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.internal; import org.apache.flink.api.common.io.DefaultInputSplitAssigner; import org.apache.flink.api.common.io.RichInputFormat; import org.apache.flink.api.common.io.statistics.BaseStatistics; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.java.typeutils.ResultTypeQueryable; import org.apache.flink.configuration.Configuration; import org.apache.flink.connector.clickhouse.internal.common.DistributedEngineFullSchema; import org.apache.flink.connector.clickhouse.internal.connection.ClickHouseConnectionProvider; import org.apache.flink.connector.clickhouse.internal.converter.ClickHouseRowConverter; import org.apache.flink.connector.clickhouse.internal.options.ClickHouseReadOptions; import org.apache.flink.connector.clickhouse.split.ClickHouseParametersProvider; import org.apache.flink.core.io.GenericInputSplit; import org.apache.flink.core.io.InputSplit; import org.apache.flink.core.io.InputSplitAssigner; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.RowType; import org.apache.flink.util.Preconditions; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Properties; import static org.apache.flink.connector.clickhouse.util.ClickHouseUtil.getAndParseDistributedEngineSchema; /** Abstract Clickhouse input format. 
*/ public abstract class AbstractClickHouseInputFormat extends RichInputFormat<RowData, InputSplit> implements ResultTypeQueryable<RowData> { protected final String[] fieldNames; protected final TypeInformation<RowData> rowDataTypeInfo; protected final Object[][] parameterValues; protected final String parameterClause; protected final String filterClause; protected final long limit; protected AbstractClickHouseInputFormat( String[] fieldNames, TypeInformation<RowData> rowDataTypeInfo, Object[][] parameterValues, String parameterClause, String filterClause, long limit) { this.fieldNames = fieldNames; this.rowDataTypeInfo = rowDataTypeInfo; this.parameterValues = parameterValues; this.parameterClause = parameterClause; this.filterClause = filterClause; this.limit = limit; } @Override public void configure(Configuration parameters) {} @Override public BaseStatistics getStatistics(BaseStatistics cachedStatistics) { return cachedStatistics; } @Override public TypeInformation<RowData> getProducedType() { return rowDataTypeInfo; } @Override public InputSplitAssigner getInputSplitAssigner(InputSplit[] inputSplits) { return new DefaultInputSplitAssigner(inputSplits); } protected InputSplit[] createGenericInputSplits(int splitNum) { GenericInputSplit[] ret = new GenericInputSplit[splitNum]; for (int i = 0; i < ret.length; i++) { ret[i] = new GenericInputSplit(i, ret.length); } return ret; } protected String getQuery(String table, String database) { String queryTemplate = ClickHouseStatementFactory.getSelectStatement(table, database, fieldNames); StringBuilder whereBuilder = new StringBuilder(); if (filterClause != null) { if (filterClause.toLowerCase().contains(" or ")) { whereBuilder.append("(").append(filterClause).append(")"); } else { whereBuilder.append(filterClause); } } if (parameterClause != null) { if (whereBuilder.length() > 0) { whereBuilder.append(" AND "); } whereBuilder.append(parameterClause); } String limitClause = ""; if (limit >= 0) { limitClause = "LIMIT " + limit; } return whereBuilder.length() > 0 ? String.join(" ", queryTemplate, "WHERE", whereBuilder.toString(), limitClause) : String.join(" ", queryTemplate, limitClause); } /** Builder.
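 *
 * <p>A rough wiring sketch; {@code readOptions} and {@code typeInfo} are assumed to be created
 * elsewhere, and the field names and types are illustrative only:
 *
 * <pre>{@code
 * AbstractClickHouseInputFormat inputFormat =
 *         new AbstractClickHouseInputFormat.Builder()
 *                 .withOptions(readOptions)                  // ClickHouseReadOptions
 *                 .withConnectionProperties(new Properties())
 *                 .withFieldNames(new String[] {"id", "name"})
 *                 .withFieldTypes(new DataType[] {DataTypes.BIGINT(), DataTypes.STRING()})
 *                 .withRowDataTypeInfo(typeInfo)             // TypeInformation<RowData>
 *                 .withFilterClause("id > 0")
 *                 .withLimit(-1L)                            // negative value: no LIMIT clause
 *                 .build();
 * }</pre>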
*/ public static class Builder { private ClickHouseReadOptions readOptions; private Properties connectionProperties; private DistributedEngineFullSchema engineFullSchema; private Map<Integer, String> shardMap; private Object[][] shardValues; private String[] fieldNames; private DataType[] fieldTypes; private TypeInformation<RowData> rowDataTypeInfo; private Object[][] parameterValues; private String parameterClause; private String filterClause; private long limit; public Builder withOptions(ClickHouseReadOptions readOptions) { this.readOptions = readOptions; return this; } public Builder withConnectionProperties(Properties connectionProperties) { this.connectionProperties = connectionProperties; return this; } public Builder withFieldNames(String[] fieldNames) { this.fieldNames = fieldNames; return this; } public Builder withFieldTypes(DataType[] fieldTypes) { this.fieldTypes = fieldTypes; return this; } public Builder withRowDataTypeInfo(TypeInformation<RowData> rowDataTypeInfo) { this.rowDataTypeInfo = rowDataTypeInfo; return this; } public Builder withFilterClause(String filterClause) { this.filterClause = filterClause; return this; } public Builder withLimit(long limit) { this.limit = limit; return this; } public AbstractClickHouseInputFormat build() { Preconditions.checkNotNull(readOptions); Preconditions.checkNotNull(connectionProperties); Preconditions.checkNotNull(fieldNames); Preconditions.checkNotNull(fieldTypes); Preconditions.checkNotNull(rowDataTypeInfo); int[] shardIds = null; if (readOptions.isUseLocal()) { shardIds = initShardInfo(); } if (readOptions.isUseLocal() || readOptions.getPartitionColumn() != null) { initPartitionInfo(shardIds); } LogicalType[] logicalTypes = Arrays.stream(fieldTypes) .map(DataType::getLogicalType) .toArray(LogicalType[]::new); return readOptions.isUseLocal() ?
createShardInputFormat(logicalTypes) : createBatchOutputFormat(logicalTypes); } private int[] initShardInfo() { ClickHouseConnectionProvider connectionProvider = null; try { connectionProvider = new ClickHouseConnectionProvider(readOptions, connectionProperties); engineFullSchema = getAndParseDistributedEngineSchema( connectionProvider.getOrCreateConnection(), readOptions.getDatabaseName(), readOptions.getTableName()); if (engineFullSchema == null) { throw new RuntimeException( String.format( "table `%s`.`%s` is not a Distributed table", readOptions.getDatabaseName(), readOptions.getTableName())); } List shardUrls = connectionProvider.getShardUrls(engineFullSchema.getCluster()); if (!shardUrls.isEmpty()) { int len = shardUrls.size(); int[] dataIds = new int[len]; shardMap = new HashMap<>(len); for (int i = 0; i < len; i++) { shardMap.put(i, shardUrls.get(i)); dataIds[i] = i; } return dataIds; } } catch (Exception exception) { throw new RuntimeException("Get shard table info failed.", exception); } finally { if (connectionProvider != null) { connectionProvider.closeConnections(); } } return null; } private void initPartitionInfo(int[] shardIds) { try { ClickHouseParametersProvider parametersProvider = new ClickHouseParametersProvider.Builder() .setMinVal(readOptions.getPartitionLowerBound()) .setMaxVal(readOptions.getPartitionUpperBound()) .setBatchNum(readOptions.getPartitionNum()) .setUseLocal(readOptions.isUseLocal()) .setShardIds(shardIds) .build(); this.parameterValues = parametersProvider.getParameterValues(); String parameterClause = parametersProvider.getParameterClause(); if (parameterClause != null) { this.parameterClause = String.format( parametersProvider.getParameterClause(), readOptions.getPartitionColumn()); } this.shardValues = parametersProvider.getShardIdValues(); } catch (Exception exception) { throw new RuntimeException("Init partition failed.", exception); } } private AbstractClickHouseInputFormat createShardInputFormat(LogicalType[] logicalTypes) { return new ClickHouseShardInputFormat( new ClickHouseConnectionProvider(readOptions, connectionProperties), new ClickHouseRowConverter(RowType.of(logicalTypes)), readOptions, engineFullSchema, shardMap, shardValues, fieldNames, rowDataTypeInfo, parameterValues, parameterClause, filterClause, limit); } private AbstractClickHouseInputFormat createBatchOutputFormat(LogicalType[] logicalTypes) { return new ClickHouseBatchInputFormat( new ClickHouseConnectionProvider(readOptions, connectionProperties), new ClickHouseRowConverter(RowType.of(logicalTypes)), readOptions, fieldNames, rowDataTypeInfo, parameterValues, parameterClause, filterClause, limit); } } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/AbstractClickHouseOutputFormat.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.internal; import org.apache.flink.api.common.io.RichOutputFormat; import org.apache.flink.configuration.Configuration; import org.apache.flink.connector.clickhouse.internal.connection.ClickHouseConnectionProvider; import org.apache.flink.connector.clickhouse.internal.executor.ClickHouseExecutor; import org.apache.flink.connector.clickhouse.internal.options.ClickHouseDmlOptions; import org.apache.flink.connector.clickhouse.internal.partitioner.ClickHousePartitioner; import org.apache.flink.table.catalog.UniqueConstraint; import org.apache.flink.table.data.RowData; import org.apache.flink.table.data.RowData.FieldGetter; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.util.Preconditions; import org.apache.flink.util.concurrent.ExecutorThreadFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.Flushable; import java.io.IOException; import java.util.Arrays; import java.util.List; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.TimeUnit; /** Abstract class of ClickHouse output format. */ public abstract class AbstractClickHouseOutputFormat extends RichOutputFormat implements Flushable { private static final long serialVersionUID = 1L; private static final Logger LOG = LoggerFactory.getLogger(AbstractClickHouseOutputFormat.class); protected transient volatile boolean closed = false; protected transient ScheduledExecutorService scheduler; protected transient ScheduledFuture scheduledFuture; protected transient volatile Exception flushException; public AbstractClickHouseOutputFormat() {} @Override public void configure(Configuration parameters) {} public void scheduledFlush(long intervalMillis, String executorName) { Preconditions.checkArgument(intervalMillis > 0, "flush interval must be greater than 0"); scheduler = new ScheduledThreadPoolExecutor(1, new ExecutorThreadFactory(executorName)); scheduledFuture = scheduler.scheduleWithFixedDelay( () -> { synchronized (this) { if (!closed) { try { flush(); } catch (Exception e) { flushException = e; } } } }, intervalMillis, intervalMillis, TimeUnit.MILLISECONDS); } public void checkBeforeFlush(final ClickHouseExecutor executor) throws IOException { checkFlushException(); try { executor.executeBatch(); } catch (Exception e) { throw new IOException(e); } } @Override public synchronized void close() { if (!closed) { closed = true; try { flush(); } catch (Exception exception) { LOG.warn("Flushing records to ClickHouse failed.", exception); } if (scheduledFuture != null) { scheduledFuture.cancel(false); this.scheduler.shutdown(); } closeOutputFormat(); checkFlushException(); } } protected void checkFlushException() { if (flushException != null) { throw new RuntimeException("Flush exception found.", flushException); } } protected abstract void closeOutputFormat(); /** Builder for {@link ClickHouseBatchOutputFormat} and {@link ClickHouseShardOutputFormat}. 
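 *
 * <p>A rough wiring sketch; {@code dmlOptions} is assumed to be built elsewhere, and the field
 * names and types are illustrative only. With {@code use-local} disabled this produces a
 * {@link ClickHouseBatchOutputFormat}, otherwise a {@link ClickHouseShardOutputFormat}:
 *
 * <pre>{@code
 * AbstractClickHouseOutputFormat outputFormat =
 *         new AbstractClickHouseOutputFormat.Builder()
 *                 .withOptions(dmlOptions)                   // ClickHouseDmlOptions
 *                 .withFieldNames(new String[] {"id", "name"})
 *                 .withFieldDataTypes(new DataType[] {DataTypes.BIGINT(), DataTypes.STRING()})
 *                 .withPrimaryKey(null)                      // no primary key: append-only writes
 *                 .withPartitionKey(Collections.emptyList())
 *                 .build();
 * }</pre>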
*/ public static class Builder { private static final Logger LOG = LoggerFactory.getLogger(AbstractClickHouseOutputFormat.Builder.class); private DataType[] fieldDataTypes; private ClickHouseDmlOptions options; private String[] fieldNames; private UniqueConstraint primaryKey; private List partitionKeys; public Builder() {} public AbstractClickHouseOutputFormat.Builder withOptions(ClickHouseDmlOptions options) { this.options = options; return this; } public AbstractClickHouseOutputFormat.Builder withFieldDataTypes( DataType[] fieldDataTypes) { this.fieldDataTypes = fieldDataTypes; return this; } public AbstractClickHouseOutputFormat.Builder withFieldNames(String[] fieldNames) { this.fieldNames = fieldNames; return this; } public AbstractClickHouseOutputFormat.Builder withPrimaryKey(UniqueConstraint primaryKey) { this.primaryKey = primaryKey; return this; } public AbstractClickHouseOutputFormat.Builder withPartitionKey(List partitionKeys) { this.partitionKeys = partitionKeys; return this; } public AbstractClickHouseOutputFormat build() { Preconditions.checkNotNull(options); Preconditions.checkNotNull(fieldNames); Preconditions.checkNotNull(fieldDataTypes); LogicalType[] logicalTypes = Arrays.stream(fieldDataTypes) .map(DataType::getLogicalType) .toArray(LogicalType[]::new); if (primaryKey != null) { LOG.warn("If primary key is specified, connector will be in UPSERT mode."); LOG.warn( "The data will be updated / deleted by the primary key, you will have significant performance loss."); } return options.isUseLocal() ? createShardOutputFormat(logicalTypes) : createBatchOutputFormat(logicalTypes); } private ClickHouseBatchOutputFormat createBatchOutputFormat(LogicalType[] logicalTypes) { String[] keyFields = new String[0]; if (primaryKey != null) { keyFields = listToStringArray(primaryKey.getColumns()); } return new ClickHouseBatchOutputFormat( new ClickHouseConnectionProvider(options), fieldNames, keyFields, listToStringArray(partitionKeys), logicalTypes, options); } private ClickHouseShardOutputFormat createShardOutputFormat(LogicalType[] logicalTypes) { String partitionStrategy = options.getPartitionStrategy(); ClickHousePartitioner partitioner; switch (partitionStrategy) { case ClickHousePartitioner.BALANCED: partitioner = ClickHousePartitioner.createBalanced(); break; case ClickHousePartitioner.SHUFFLE: partitioner = ClickHousePartitioner.createShuffle(); break; case ClickHousePartitioner.HASH: int index = Arrays.asList(fieldNames).indexOf(options.getPartitionKey()); if (index == -1) { throw new IllegalArgumentException( String.format( "Partition key `%s` not found in table schema", options.getPartitionKey())); } FieldGetter getter = RowData.createFieldGetter(logicalTypes[index], index); partitioner = ClickHousePartitioner.createHash(getter); break; default: throw new IllegalArgumentException( String.format( "Unknown sink.partition-strategy `%s`", partitionStrategy)); } String[] keyFields = new String[0]; if (primaryKey != null) { keyFields = listToStringArray(primaryKey.getColumns()); } return new ClickHouseShardOutputFormat( new ClickHouseConnectionProvider(options), fieldNames, keyFields, listToStringArray(partitionKeys), logicalTypes, partitioner, options); } private String[] listToStringArray(List list) { if (list == null) { return new String[0]; } else { return list.toArray(new String[0]); } } } } ================================================ FILE: 
fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/ClickHouseBatchInputFormat.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.internal; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.connector.clickhouse.internal.connection.ClickHouseConnectionProvider; import org.apache.flink.connector.clickhouse.internal.converter.ClickHouseRowConverter; import org.apache.flink.connector.clickhouse.internal.options.ClickHouseReadOptions; import org.apache.flink.core.io.InputSplit; import org.apache.flink.table.data.RowData; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import ru.yandex.clickhouse.ClickHouseConnection; import java.io.IOException; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; /** ClickHouse batch input format. */ public class ClickHouseBatchInputFormat extends AbstractClickHouseInputFormat { private static final Logger LOG = LoggerFactory.getLogger(ClickHouseBatchInputFormat.class); private final ClickHouseConnectionProvider connectionProvider; private final ClickHouseRowConverter rowConverter; private final ClickHouseReadOptions readOptions; private transient PreparedStatement statement; private transient ResultSet resultSet; private transient boolean hasNext; public ClickHouseBatchInputFormat( ClickHouseConnectionProvider connectionProvider, ClickHouseRowConverter rowConverter, ClickHouseReadOptions readOptions, String[] fieldNames, TypeInformation<RowData> rowDataTypeInfo, Object[][] parameterValues, String parameterClause, String filterClause, long limit) { super(fieldNames, rowDataTypeInfo, parameterValues, parameterClause, filterClause, limit); this.connectionProvider = connectionProvider; this.rowConverter = rowConverter; this.readOptions = readOptions; } @Override public void openInputFormat() { try { ClickHouseConnection connection = connectionProvider.getOrCreateConnection(); String query = getQuery(readOptions.getTableName(), readOptions.getDatabaseName()); statement = connection.prepareStatement(query); } catch (SQLException se) { throw new IllegalArgumentException("open() failed."
+ se.getMessage(), se); } } @Override public void closeInputFormat() { try { if (statement != null) { statement.close(); } } catch (SQLException exception) { LOG.info("InputFormat Statement couldn't be closed.", exception); } finally { statement = null; } if (connectionProvider != null) { connectionProvider.closeConnections(); } } @Override public void open(InputSplit split) { try { if (split != null && parameterValues != null) { for (int i = 0; i < parameterValues[split.getSplitNumber()].length; i++) { Object param = parameterValues[split.getSplitNumber()][i]; statement.setObject(i + 1, param); } } resultSet = statement.executeQuery(); hasNext = resultSet.next(); } catch (SQLException se) { throw new IllegalArgumentException("open() failed." + se.getMessage(), se); } } @Override public void close() { try { if (resultSet != null) { resultSet.close(); } } catch (SQLException se) { LOG.info("InputFormat ResultSet couldn't be closed.", se); } } @Override public boolean reachedEnd() { return !hasNext; } @Override public RowData nextRecord(RowData reuse) throws IOException { if (!hasNext) { return null; } try { RowData row = rowConverter.toInternal(resultSet); // update hasNext after we've read the record hasNext = resultSet.next(); return row; } catch (Exception exception) { throw new IOException("Couldn't read data from resultSet.", exception); } } @Override public InputSplit[] createInputSplits(int minNumSplits) { int splitNum = parameterValues != null ? parameterValues.length : 1; return createGenericInputSplits(splitNum); } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/ClickHouseBatchOutputFormat.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.internal; import org.apache.flink.connector.clickhouse.internal.connection.ClickHouseConnectionProvider; import org.apache.flink.connector.clickhouse.internal.executor.ClickHouseExecutor; import org.apache.flink.connector.clickhouse.internal.options.ClickHouseDmlOptions; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.util.Preconditions; import javax.annotation.Nonnull; import java.io.IOException; import java.sql.SQLException; /** Output data to ClickHouse local table. 
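 *
 * <p>Records passed to {@code writeRecord} are buffered by the executor and flushed either when
 * the buffered count reaches {@code sink.batch-size} or when the background thread started via
 * {@code scheduledFlush} fires after each {@code sink.flush-interval}; a failure in the
 * asynchronous flush is surfaced on the next write or on close.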
*/ public class ClickHouseBatchOutputFormat extends AbstractClickHouseOutputFormat { private static final long serialVersionUID = 1L; private final ClickHouseConnectionProvider connectionProvider; private final String[] fieldNames; private final String[] keyFields; private final String[] partitionFields; private final LogicalType[] fieldTypes; private final ClickHouseDmlOptions options; private transient ClickHouseExecutor executor; private transient int batchCount = 0; protected ClickHouseBatchOutputFormat( @Nonnull ClickHouseConnectionProvider connectionProvider, @Nonnull String[] fieldNames, @Nonnull String[] keyFields, @Nonnull String[] partitionFields, @Nonnull LogicalType[] fieldTypes, @Nonnull ClickHouseDmlOptions options) { this.connectionProvider = Preconditions.checkNotNull(connectionProvider); this.fieldNames = Preconditions.checkNotNull(fieldNames); this.keyFields = Preconditions.checkNotNull(keyFields); this.partitionFields = Preconditions.checkNotNull(partitionFields); this.fieldTypes = Preconditions.checkNotNull(fieldTypes); this.options = Preconditions.checkNotNull(options); } @Override public void open(int taskNumber, int numTasks) throws IOException { try { // TODO Distributed tables don't support update and delete statements. executor = ClickHouseExecutor.createClickHouseExecutor( options.getTableName(), options.getDatabaseName(), null, fieldNames, keyFields, partitionFields, fieldTypes, options); executor.prepareStatement(connectionProvider); executor.setRuntimeContext(getRuntimeContext()); long flushIntervalMillis = options.getFlushInterval().toMillis(); scheduledFlush(flushIntervalMillis, "clickhouse-batch-output-format"); } catch (Exception exception) { throw new IOException("Unable to establish connection with ClickHouse.", exception); } } @Override public synchronized void writeRecord(RowData record) throws IOException { checkFlushException(); try { executor.addToBatch(record); batchCount++; if (batchCount >= options.getBatchSize()) { flush(); } } catch (SQLException exception) { throw new IOException("Writing record to ClickHouse statement failed.", exception); } } @Override public synchronized void flush() throws IOException { if (batchCount > 0) { checkBeforeFlush(executor); batchCount = 0; } } @Override public synchronized void closeOutputFormat() { executor.closeStatement(); connectionProvider.closeConnections(); } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/ClickHouseShardInputFormat.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.connector.clickhouse.internal; import org.apache.flink.annotation.Experimental; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.configuration.Configuration; import org.apache.flink.connector.clickhouse.internal.common.DistributedEngineFullSchema; import org.apache.flink.connector.clickhouse.internal.connection.ClickHouseConnectionProvider; import org.apache.flink.connector.clickhouse.internal.converter.ClickHouseRowConverter; import org.apache.flink.connector.clickhouse.internal.options.ClickHouseReadOptions; import org.apache.flink.core.io.InputSplit; import org.apache.flink.table.data.RowData; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import ru.yandex.clickhouse.ClickHouseConnection; import java.io.IOException; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; import java.util.Map; /** ClickHouse shard input format. */ @Experimental public class ClickHouseShardInputFormat extends AbstractClickHouseInputFormat { private static final Logger LOG = LoggerFactory.getLogger(ClickHouseShardInputFormat.class); private final ClickHouseConnectionProvider connectionProvider; private final ClickHouseRowConverter rowConverter; private final ClickHouseReadOptions readOptions; private final DistributedEngineFullSchema engineFullSchema; private final Map shardMap; private final Object[][] shardValues; private transient List statements; private transient List resultSets; private transient boolean hasNext; private transient int rsIndex = -1; public ClickHouseShardInputFormat( ClickHouseConnectionProvider connectionProvider, ClickHouseRowConverter rowConverter, ClickHouseReadOptions readOptions, DistributedEngineFullSchema engineFullSchema, Map shardMap, Object[][] shardValues, String[] fieldNames, TypeInformation rowDataTypeInfo, Object[][] parameterValues, String parameterClause, String filterClause, long limit) { super(fieldNames, rowDataTypeInfo, parameterValues, parameterClause, filterClause, limit); this.connectionProvider = connectionProvider; this.rowConverter = rowConverter; this.readOptions = readOptions; this.engineFullSchema = engineFullSchema; this.shardMap = shardMap; this.shardValues = shardValues; } @Override public void configure(Configuration parameters) { super.configure(parameters); this.statements = new ArrayList<>(); this.resultSets = new ArrayList<>(); } @Override public void openInputFormat() {} @Override public void closeInputFormat() { if (connectionProvider != null) { connectionProvider.closeConnections(); } } @Override public void open(InputSplit split) { try { Object[] shardIds = shardValues[split.getSplitNumber()]; for (int i = 0; i < shardIds.length; i++) { // PreparedStatement. String shardUrl = shardMap.get((Integer) shardIds[i]); ClickHouseConnection connection = connectionProvider.createAndStoreShardConnection( shardUrl, engineFullSchema.getDatabase()); String query = getQuery(engineFullSchema.getTable(), engineFullSchema.getDatabase()); PreparedStatement statement = connection.prepareStatement(query); statements.add(i, statement); // ResultSet. 
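// Parameters for this split are bound to every shard statement, but only the first shard's
// query is executed eagerly; the remaining result sets are opened lazily in nextValidResultSet().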
if (parameterValues != null) { Object[] parameters = parameterValues[split.getSplitNumber()]; for (int j = 0; j < parameters.length; j++) { statement.setObject(j + 1, parameters[j]); } } if (i == 0) { ResultSet resultSet = statement.executeQuery(); resultSets.add(i, resultSet); hasNext = resultSet.next(); } } rsIndex = 0; } catch (SQLException se) { throw new IllegalArgumentException("open() failed." + se.getMessage(), se); } } @Override public void close() { for (ResultSet resultSet : resultSets) { try { if (resultSet != null) { resultSet.close(); } } catch (SQLException se) { LOG.info("InputFormat ResultSet couldn't be closed.", se); } } for (PreparedStatement statement : statements) { try { if (statement != null) { statement.close(); } } catch (SQLException se) { LOG.info("InputFormat Statement couldn't be closed.", se); } } resultSets.clear(); statements.clear(); } @Override public boolean reachedEnd() { final int maxIndex = statements.size() - 1; return !hasNext && rsIndex == maxIndex; } @Override public RowData nextRecord(RowData reuse) throws IOException { if (!hasNext && !nextValidResultSet()) { return null; } try { ResultSet resultSet = resultSets.get(rsIndex); RowData row = rowConverter.toInternal(resultSet); // update hasNext after we've read the record hasNext = resultSet.next(); return row; } catch (Exception exception) { throw new IOException("Couldn't read data from resultSet.", exception); } } private boolean nextValidResultSet() { while (++rsIndex < statements.size()) { try { PreparedStatement statement = statements.get(rsIndex); ResultSet resultSet = statement.executeQuery(); resultSets.add(rsIndex, resultSet); hasNext = resultSet.next(); if (hasNext) { return true; } } catch (SQLException e) { throw new RuntimeException("Execute query failed, rsIndex = " + rsIndex); } } return false; } @Override public InputSplit[] createInputSplits(int minNumSplits) { return createGenericInputSplits(shardValues.length); } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/ClickHouseShardOutputFormat.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.connector.clickhouse.internal; import org.apache.flink.connector.clickhouse.internal.common.DistributedEngineFullSchema; import org.apache.flink.connector.clickhouse.internal.connection.ClickHouseConnectionProvider; import org.apache.flink.connector.clickhouse.internal.executor.ClickHouseExecutor; import org.apache.flink.connector.clickhouse.internal.options.ClickHouseDmlOptions; import org.apache.flink.connector.clickhouse.internal.partitioner.ClickHousePartitioner; import org.apache.flink.connector.clickhouse.util.ClickHouseUtil; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.util.Preconditions; import ru.yandex.clickhouse.ClickHouseConnection; import javax.annotation.Nonnull; import java.io.IOException; import java.util.ArrayList; import java.util.List; /** * The shard output format of distributed table.
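 * <p>One {@link ClickHouseExecutor} is created per shard of the Distributed table; each record is
 * routed to a shard by the configured {@link ClickHousePartitioner} and flushed per shard once the
 * batch size or the flush interval is reached.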
* TODO: Use ClickHouse's sharding key to distribute data to different instances. */ public class ClickHouseShardOutputFormat extends AbstractClickHouseOutputFormat { private static final long serialVersionUID = 1L; private final ClickHouseConnectionProvider connectionProvider; private final String[] fieldNames; private final LogicalType[] logicalTypes; private final ClickHousePartitioner partitioner; private final ClickHouseDmlOptions options; private final List shardExecutors; private final boolean ignoreDelete; private final String[] keyFields; private final String[] partitionFields; private transient int[] batchCounts; protected ClickHouseShardOutputFormat( @Nonnull ClickHouseConnectionProvider connectionProvider, @Nonnull String[] fieldNames, @Nonnull String[] keyFields, @Nonnull String[] partitionFields, @Nonnull LogicalType[] logicalTypes, @Nonnull ClickHousePartitioner partitioner, @Nonnull ClickHouseDmlOptions options) { this.connectionProvider = Preconditions.checkNotNull(connectionProvider); this.fieldNames = Preconditions.checkNotNull(fieldNames); this.keyFields = keyFields; this.partitionFields = partitionFields; this.logicalTypes = Preconditions.checkNotNull(logicalTypes); this.partitioner = Preconditions.checkNotNull(partitioner); this.options = Preconditions.checkNotNull(options); this.shardExecutors = new ArrayList<>(); this.ignoreDelete = options.getIgnoreDelete(); } @Override public void open(int taskNumber, int numTasks) throws IOException { try { // Get the local table of distributed table. DistributedEngineFullSchema shardTableSchema = ClickHouseUtil.getAndParseDistributedEngineSchema( connectionProvider.getOrCreateConnection(), options.getDatabaseName(), options.getTableName()); if (shardTableSchema == null) { throw new RuntimeException( String.format( "table `%s`.`%s` is not a Distributed table", options.getDatabaseName(), options.getTableName())); } List shardConnections = connectionProvider.createShardConnections( shardTableSchema.getCluster(), shardTableSchema.getDatabase()); for (ClickHouseConnection shardConnection : shardConnections) { ClickHouseExecutor executor = ClickHouseExecutor.createClickHouseExecutor( shardTableSchema.getTable(), shardTableSchema.getDatabase(), shardTableSchema.getCluster(), fieldNames, keyFields, partitionFields, logicalTypes, options); executor.prepareStatement(shardConnection); shardExecutors.add(executor); } batchCounts = new int[shardConnections.size()]; long flushIntervalMillis = options.getFlushInterval().toMillis(); scheduledFlush(flushIntervalMillis, "clickhouse-shard-output-format"); } catch (Exception exception) { throw new IOException("Unable to establish connection to ClickHouse", exception); } } /** * TODO: It's not appropriate to write records in this way, we should adapt it to ClickHouse's * data shard strategy. 
*/ @Override public synchronized void writeRecord(RowData record) throws IOException { checkFlushException(); switch (record.getRowKind()) { case INSERT: case UPDATE_AFTER: writeRecordToOneExecutor(record); break; case DELETE: if (!ignoreDelete) { writeRecordToOneExecutor(record); } break; case UPDATE_BEFORE: break; default: throw new UnsupportedOperationException( String.format( "Unknown row kind, the supported row kinds is: INSERT, UPDATE_BEFORE, UPDATE_AFTER, DELETE, but get: %s.", record.getRowKind())); } } private void writeRecordToOneExecutor(RowData record) throws IOException { try { int selected = partitioner.select(record, shardExecutors.size()); shardExecutors.get(selected).addToBatch(record); batchCounts[selected]++; if (batchCounts[selected] >= options.getBatchSize()) { flush(selected); } } catch (Exception exception) { throw new IOException("Writing record to one executor failed.", exception); } } @Override public synchronized void flush() throws IOException { for (int i = 0; i < shardExecutors.size(); i++) { flush(i); } } private synchronized void flush(int index) throws IOException { if (batchCounts[index] > 0) { checkBeforeFlush(shardExecutors.get(index)); batchCounts[index] = 0; } } @Override public synchronized void closeOutputFormat() { for (ClickHouseExecutor shardExecutor : shardExecutors) { shardExecutor.closeStatement(); } connectionProvider.closeConnections(); } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/ClickHouseStatementFactory.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.internal; import org.apache.commons.lang3.ArrayUtils; import java.util.Arrays; import static java.lang.String.format; import static java.util.stream.Collectors.joining; /** Create an insert/update/delete ClickHouse statement. 
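 * <p>Identifiers are back-quoted; UPDATE and DELETE are generated as ALTER TABLE ... UPDATE /
 * ALTER TABLE ... DELETE WHERE mutations, with an optional ON CLUSTER clause.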
*/ public class ClickHouseStatementFactory { private static final String EMPTY = ""; private ClickHouseStatementFactory() {} public static String getSelectStatement( String tableName, String databaseName, String[] fieldNames) { String columns = Arrays.stream(fieldNames) .map(ClickHouseStatementFactory::quoteIdentifier) .collect(joining(", ")); return String.join( EMPTY, "SELECT ", columns, " FROM ", fromTableClause(tableName, databaseName)); } public static String getInsertIntoStatement(String tableName, String[] fieldNames) { String columns = Arrays.stream(fieldNames) .map(ClickHouseStatementFactory::quoteIdentifier) .collect(joining(", ")); String placeholders = Arrays.stream(fieldNames).map((f) -> "?").collect(joining(", ")); return String.join( EMPTY, "INSERT INTO ", quoteIdentifier(tableName), "(", columns, ") VALUES (", placeholders, ")"); } public static String getUpdateStatement( String tableName, String databaseName, String clusterName, String[] fieldNames, String[] keyFields, String[] partitionFields) { String setClause = Arrays.stream(fieldNames) .filter(f -> !ArrayUtils.contains(keyFields, f)) .filter(f -> !ArrayUtils.contains(partitionFields, f)) .map((f) -> quoteIdentifier(f) + "=?") .collect(joining(", ")); String conditionClause = Arrays.stream(keyFields) .map((f) -> quoteIdentifier(f) + "=?") .collect(joining(" AND ")); String onClusterClause = ""; if (clusterName != null) { onClusterClause = " ON CLUSTER " + quoteIdentifier(clusterName); } return String.join( EMPTY, "ALTER TABLE ", fromTableClause(tableName, databaseName), onClusterClause, " UPDATE ", setClause, " WHERE ", conditionClause); } public static String getDeleteStatement( String tableName, String databaseName, String clusterName, String[] conditionFields) { String conditionClause = Arrays.stream(conditionFields) .map((f) -> quoteIdentifier(f) + "=?") .collect(joining(" AND ")); String onClusterClause = ""; if (clusterName != null) { onClusterClause = " ON CLUSTER " + quoteIdentifier(clusterName); } return String.join( EMPTY, "ALTER TABLE ", fromTableClause(tableName, databaseName), onClusterClause, " DELETE WHERE ", conditionClause); } private static String fromTableClause(String tableName, String databaseName) { if (databaseName == null) { return quoteIdentifier(tableName); } return format("%s.%s", quoteIdentifier(databaseName), quoteIdentifier(tableName)); } private static String quoteIdentifier(String identifier) { return String.join(EMPTY, "`", identifier, "`"); } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/common/DistributedEngineFullSchema.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.connector.clickhouse.internal.common; import org.apache.flink.util.StringUtils; import java.io.Serializable; import static org.apache.flink.util.Preconditions.checkArgument; /** Distributed table engine full schema. */ public class DistributedEngineFullSchema implements Serializable { private final String cluster; private final String database; private final String table; private final String shardingKey; private final String policyName; public static DistributedEngineFullSchema of(String cluster, String database, String table) { return new DistributedEngineFullSchema(cluster, database, table); } public static DistributedEngineFullSchema of( String cluster, String database, String table, String shardingKey, String policyName) { return new DistributedEngineFullSchema(cluster, database, table, shardingKey, policyName); } private DistributedEngineFullSchema( String cluster, String database, String table, String shardingKey, String policyName) { checkArgument( !StringUtils.isNullOrWhitespaceOnly(cluster), "cluster cannot be null or empty"); checkArgument( !StringUtils.isNullOrWhitespaceOnly(database), "database cannot be null or empty"); checkArgument(!StringUtils.isNullOrWhitespaceOnly(table), "table cannot be null or empty"); this.cluster = cluster; this.database = database; this.table = table; this.shardingKey = shardingKey; this.policyName = policyName; } private DistributedEngineFullSchema(String cluster, String database, String table) { this(cluster, database, table, null, null); } public String getCluster() { return cluster; } public String getDatabase() { return database; } public String getTable() { return table; } public String getShardingKey() { return shardingKey; } public String getPolicyName() { return policyName; } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/connection/ClickHouseConnectionProvider.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
 */
package org.apache.flink.connector.clickhouse.internal.connection;

import org.apache.flink.connector.clickhouse.internal.options.ClickHouseConnectionOptions;
import org.apache.flink.connector.clickhouse.util.ClickHouseUtil;

import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ru.yandex.clickhouse.BalancedClickhouseDataSource;
import ru.yandex.clickhouse.ClickHouseConnection;
import ru.yandex.clickhouse.settings.ClickHouseProperties;

import java.io.Serializable;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/** ClickHouse connection provider. Use ClickHouseDriver to create a connection. */
public class ClickHouseConnectionProvider implements Serializable {

    private static final long serialVersionUID = 1L;

    private static final Logger LOG = LoggerFactory.getLogger(ClickHouseConnectionProvider.class);

    private static final Pattern HTTP_PORT_PATTERN =
            Pattern.compile("You must use port (?<port>[0-9]+) for HTTP.");

    /**
     * Query different shard info.
     *
     * <p>
TODO: Should consider `shard_weight` when writing data into different shards, may be also * `replica_num`. */ private static final String QUERY_CLUSTER_INFO_SQL = "SELECT shard_num, host_address, port FROM system.clusters WHERE cluster = ? and replica_num = 1 ORDER BY shard_num ASC"; private final ClickHouseConnectionOptions options; private final Properties connectionProperties; private transient ClickHouseConnection connection; private transient List shardConnections; public ClickHouseConnectionProvider(ClickHouseConnectionOptions options) { this(options, new Properties()); } public ClickHouseConnectionProvider( ClickHouseConnectionOptions options, Properties connectionProperties) { this.options = options; this.connectionProperties = connectionProperties; } public synchronized ClickHouseConnection getOrCreateConnection() throws SQLException { if (connection == null) { connection = createConnection(options.getUrl(), options.getDatabaseName()); } return connection; } public synchronized List createShardConnections( String shardCluster, String shardDatabase) throws SQLException { List shardUrls = getShardUrls(shardCluster); if (shardUrls.isEmpty()) { throw new SQLException("Unable to query shards in system.clusters"); } List connections = new ArrayList<>(); for (String shardUrl : shardUrls) { ClickHouseConnection connection = createAndStoreShardConnection(shardUrl, shardDatabase); connections.add(connection); } return connections; } public synchronized ClickHouseConnection createAndStoreShardConnection( String url, String database) throws SQLException { if (shardConnections == null) { shardConnections = new ArrayList<>(); } ClickHouseConnection connection = createConnection(url, database); shardConnections.add(connection); return connection; } public List getShardUrls(String remoteCluster) throws SQLException { List urls = new ArrayList<>(); ClickHouseConnection conn = getOrCreateConnection(); try (PreparedStatement stmt = conn.prepareStatement(QUERY_CLUSTER_INFO_SQL)) { stmt.setString(1, remoteCluster); try (ResultSet rs = stmt.executeQuery()) { while (rs.next()) { // TODO: String host = rs.getString("host_address"); String host = rs.getString("host_address").replace("192.168.126.", "192.168.0."); int port = getActualHttpPort(host, rs.getInt("port")); urls.add("jdbc:clickhouse://" + host + ":" + port); } } } return urls; } private ClickHouseConnection createConnection(String url, String database) throws SQLException { LOG.info("connecting to {}, database {}", url, database); String jdbcUrl = ClickHouseUtil.getJdbcUrl(url, database); ClickHouseProperties properties = new ClickHouseProperties(connectionProperties); properties.setUser(options.getUsername().orElse(null)); properties.setPassword(options.getPassword().orElse(null)); BalancedClickhouseDataSource dataSource = new BalancedClickhouseDataSource(jdbcUrl, properties); if (dataSource.getAllClickhouseUrls().size() > 1) { dataSource.actualize(); } return dataSource.getConnection(); } public void closeConnections() { if (this.connection != null) { try { connection.close(); } catch (SQLException exception) { LOG.warn("ClickHouse connection could not be closed.", exception); } finally { connection = null; } } if (shardConnections != null) { for (ClickHouseConnection shardConnection : this.shardConnections) { try { shardConnection.close(); } catch (SQLException exception) { LOG.warn("ClickHouse shard connection could not be closed.", exception); } } shardConnections = null; } } private int getActualHttpPort(String host, int port) throws 
SQLException { try (CloseableHttpClient httpclient = HttpClients.createDefault()) { HttpGet request = new HttpGet( (new URIBuilder()) .setScheme("http") .setHost(host) .setPort(port) .build()); HttpResponse response = httpclient.execute(request); int statusCode = response.getStatusLine().getStatusCode(); if (statusCode != 200) { String raw = EntityUtils.toString(response.getEntity()); Matcher matcher = HTTP_PORT_PATTERN.matcher(raw); if (matcher.find()) { return Integer.parseInt(matcher.group("port")); } throw new SQLException("Cannot query ClickHouse http port."); } return port; } catch (Throwable throwable) { throw new SQLException("Cannot connect to ClickHouse server using HTTP.", throwable); } } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/converter/ClickHouseConverterUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.internal.converter; import org.apache.flink.table.data.ArrayData; import org.apache.flink.table.data.DecimalData; import org.apache.flink.table.data.GenericArrayData; import org.apache.flink.table.data.GenericMapData; import org.apache.flink.table.data.MapData; import org.apache.flink.table.data.StringData; import org.apache.flink.table.data.TimestampData; import org.apache.flink.table.types.logical.ArrayType; import org.apache.flink.table.types.logical.DecimalType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.MapType; import java.math.BigDecimal; import java.math.BigInteger; import java.sql.Array; import java.sql.Date; import java.sql.SQLException; import java.sql.Time; import java.sql.Timestamp; import java.time.LocalDate; import java.time.LocalTime; import java.util.HashMap; import java.util.Map; import static org.apache.flink.connector.clickhouse.util.ClickHouseUtil.toFixedDateTimestamp; /** convert between internal and external data types. 
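 * <p>toExternal converts Flink internal values (dates, timestamps, decimals, arrays, maps, ...)
 * into JDBC-friendly Java objects, and toInternal does the reverse for values read from a
 * ResultSet; the row converter delegates its ARRAY and MAP fields to these helpers.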
*/ public class ClickHouseConverterUtils { public static final int BOOL_TRUE = 1; public static Object toExternal(Object value, LogicalType type) { switch (type.getTypeRoot()) { case BOOLEAN: case TINYINT: case SMALLINT: case INTEGER: case INTERVAL_YEAR_MONTH: case BIGINT: case INTERVAL_DAY_TIME: case FLOAT: case DOUBLE: case BINARY: case VARBINARY: return value; case CHAR: case VARCHAR: return value.toString(); case DATE: return Date.valueOf(LocalDate.ofEpochDay((Integer) value)); case TIME_WITHOUT_TIME_ZONE: LocalTime localTime = LocalTime.ofNanoOfDay(((Integer) value) * 1_000_000L); return toFixedDateTimestamp(localTime); case TIMESTAMP_WITH_TIME_ZONE: case TIMESTAMP_WITHOUT_TIME_ZONE: return ((TimestampData) value).toTimestamp(); case TIMESTAMP_WITH_LOCAL_TIME_ZONE: return Timestamp.from(((TimestampData) value).toInstant()); case DECIMAL: return ((DecimalData) value).toBigDecimal(); case ARRAY: LogicalType elementType = ((ArrayType) type) .getChildren().stream() .findFirst() .orElseThrow( () -> new RuntimeException( "Unknown array element type")); ArrayData.ElementGetter elementGetter = ArrayData.createElementGetter(elementType); ArrayData arrayData = ((ArrayData) value); Object[] objectArray = new Object[arrayData.size()]; for (int i = 0; i < arrayData.size(); i++) { objectArray[i] = toExternal(elementGetter.getElementOrNull(arrayData, i), elementType); } return objectArray; case MAP: LogicalType keyType = ((MapType) type).getKeyType(); LogicalType valueType = ((MapType) type).getValueType(); ArrayData.ElementGetter keyGetter = ArrayData.createElementGetter(keyType); ArrayData.ElementGetter valueGetter = ArrayData.createElementGetter(valueType); MapData mapData = (MapData) value; ArrayData keyArrayData = mapData.keyArray(); ArrayData valueArrayData = mapData.valueArray(); Map objectMap = new HashMap<>(keyArrayData.size()); for (int i = 0; i < keyArrayData.size(); i++) { objectMap.put( toExternal(keyGetter.getElementOrNull(keyArrayData, i), keyType), toExternal(valueGetter.getElementOrNull(valueArrayData, i), valueType)); } return objectMap; case MULTISET: case ROW: case RAW: default: throw new UnsupportedOperationException("Unsupported type:" + type); } } public static Object toInternal(Object value, LogicalType type) throws SQLException { switch (type.getTypeRoot()) { case NULL: return null; case BOOLEAN: return BOOL_TRUE == ((Number) value).intValue(); case FLOAT: case DOUBLE: case INTERVAL_YEAR_MONTH: case INTERVAL_DAY_TIME: case INTEGER: case BIGINT: case BINARY: case VARBINARY: return value; case TINYINT: return ((Integer) value).byteValue(); case SMALLINT: return value instanceof Integer ? ((Integer) value).shortValue() : value; case DECIMAL: final int precision = ((DecimalType) type).getPrecision(); final int scale = ((DecimalType) type).getScale(); return value instanceof BigInteger ? 
DecimalData.fromBigDecimal( new BigDecimal((BigInteger) value, 0), precision, scale) : DecimalData.fromBigDecimal((BigDecimal) value, precision, scale); case DATE: return (int) (((Date) value).toLocalDate().toEpochDay()); case TIME_WITHOUT_TIME_ZONE: return (int) (((Time) value).toLocalTime().toNanoOfDay() / 1_000_000L); case TIMESTAMP_WITH_TIME_ZONE: case TIMESTAMP_WITHOUT_TIME_ZONE: return TimestampData.fromTimestamp((Timestamp) value); case TIMESTAMP_WITH_LOCAL_TIME_ZONE: return TimestampData.fromInstant(((Timestamp) value).toInstant()); case CHAR: case VARCHAR: return StringData.fromString((String) value); case ARRAY: LogicalType elementType = type.getChildren().stream() .findFirst() .orElseThrow( () -> new RuntimeException("Unknown array element type")); Object externalArray = ((Array) value).getArray(); int externalArrayLength = java.lang.reflect.Array.getLength(externalArray); Object[] internalArray = new Object[externalArrayLength]; for (int i = 0; i < externalArrayLength; i++) { internalArray[i] = toInternal(java.lang.reflect.Array.get(externalArray, i), elementType); } return new GenericArrayData(internalArray); case MAP: LogicalType keyType = ((MapType) type).getKeyType(); LogicalType valueType = ((MapType) type).getValueType(); Map externalMap = (Map) value; Map internalMap = new HashMap<>(externalMap.size()); for (Map.Entry entry : externalMap.entrySet()) { internalMap.put( toInternal(entry.getKey(), keyType), toInternal(entry.getValue(), valueType)); } return new GenericMapData(internalMap); case ROW: case MULTISET: case RAW: default: throw new UnsupportedOperationException("Unsupported type:" + type); } } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/converter/ClickHouseRowConverter.java ================================================ // // Source code recreated from a .class file by IntelliJ IDEA // (powered by FernFlower decompiler) // package org.apache.flink.connector.clickhouse.internal.converter; import org.apache.flink.table.data.DecimalData; import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; import org.apache.flink.table.data.StringData; import org.apache.flink.table.data.TimestampData; import org.apache.flink.table.types.logical.DecimalType; import org.apache.flink.table.types.logical.LocalZonedTimestampType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.RowType; import org.apache.flink.table.types.logical.RowType.RowField; import org.apache.flink.table.types.logical.TimestampType; import org.apache.flink.util.Preconditions; import ru.yandex.clickhouse.ClickHousePreparedStatement; import ru.yandex.clickhouse.response.ClickHouseResultSet; import java.io.Serializable; import java.math.BigDecimal; import java.math.BigInteger; import java.sql.Date; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Time; import java.sql.Timestamp; import java.time.LocalDate; import java.time.LocalTime; import static org.apache.flink.connector.clickhouse.internal.converter.ClickHouseConverterUtils.BOOL_TRUE; import static org.apache.flink.connector.clickhouse.util.ClickHouseUtil.toFixedDateTimestamp; /** Row converter,convert flink type to/from ClickHouse type. 
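 * <p>Builds one serialization and one deserialization converter per field of the given RowType,
 * using them to read a ResultSet row into RowData and to bind RowData fields onto a
 * ClickHousePreparedStatement.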
*/ public class ClickHouseRowConverter implements Serializable { private static final long serialVersionUID = 1L; private final RowType rowType; private final DeserializationConverter[] toInternalConverters; private final SerializationConverter[] toExternalConverters; public ClickHouseRowConverter(RowType rowType) { this.rowType = Preconditions.checkNotNull(rowType); LogicalType[] logicalTypes = rowType.getFields().stream().map(RowField::getType).toArray(LogicalType[]::new); this.toInternalConverters = new DeserializationConverter[rowType.getFieldCount()]; this.toExternalConverters = new SerializationConverter[rowType.getFieldCount()]; for (int i = 0; i < rowType.getFieldCount(); i++) { this.toInternalConverters[i] = createToInternalConverter(rowType.getTypeAt(i)); this.toExternalConverters[i] = createToExternalConverter(logicalTypes[i]); } } public RowData toInternal(ResultSet resultSet) throws SQLException { GenericRowData genericRowData = new GenericRowData(rowType.getFieldCount()); for (int pos = 0; pos < rowType.getFieldCount(); pos++) { Object field = resultSet.getObject(pos + 1); if (field != null) { genericRowData.setField(pos, toInternalConverters[pos].deserialize(field)); } else { genericRowData.setField(pos, null); } } return genericRowData; } public void toExternal(RowData rowData, ClickHousePreparedStatement statement) throws SQLException { for (int index = 0; index < rowData.getArity(); index++) { if (!rowData.isNullAt(index)) { toExternalConverters[index].serialize(rowData, index, statement); } else { statement.setObject(index + 1, null); } } } protected ClickHouseRowConverter.DeserializationConverter createToInternalConverter( LogicalType type) { switch (type.getTypeRoot()) { case NULL: return val -> null; case BOOLEAN: return val -> BOOL_TRUE == ((Number) val).intValue(); case FLOAT: case DOUBLE: case INTERVAL_YEAR_MONTH: case INTERVAL_DAY_TIME: case INTEGER: case BIGINT: case BINARY: case VARBINARY: return val -> val; case TINYINT: return val -> ((Integer) val).byteValue(); case SMALLINT: return val -> val instanceof Integer ? ((Integer) val).shortValue() : val; case DECIMAL: final int precision = ((DecimalType) type).getPrecision(); final int scale = ((DecimalType) type).getScale(); return val -> val instanceof BigInteger ? 
DecimalData.fromBigDecimal( new BigDecimal((BigInteger) val, 0), precision, scale) : DecimalData.fromBigDecimal((BigDecimal) val, precision, scale); case DATE: return val -> (int) ((Date) val).toLocalDate().toEpochDay(); case TIME_WITHOUT_TIME_ZONE: return val -> (int) (((Time) val).toLocalTime().toNanoOfDay() / 1_000_000L); case TIMESTAMP_WITH_TIME_ZONE: case TIMESTAMP_WITHOUT_TIME_ZONE: return val -> TimestampData.fromTimestamp((Timestamp) val); case TIMESTAMP_WITH_LOCAL_TIME_ZONE: return val -> TimestampData.fromInstant(((Timestamp) val).toInstant()); case CHAR: case VARCHAR: return val -> StringData.fromString((String) val); case ARRAY: case MAP: return val -> ClickHouseConverterUtils.toInternal(val, type); case ROW: case MULTISET: case RAW: default: throw new UnsupportedOperationException("Unsupported type:" + type); } } protected ClickHouseRowConverter.SerializationConverter createToExternalConverter( LogicalType type) { switch (type.getTypeRoot()) { case BOOLEAN: return (val, index, statement) -> statement.setBoolean(index + 1, val.getBoolean(index)); case FLOAT: return (val, index, statement) -> statement.setFloat(index + 1, val.getFloat(index)); case DOUBLE: return (val, index, statement) -> statement.setDouble(index + 1, val.getDouble(index)); case INTERVAL_YEAR_MONTH: case INTEGER: return (val, index, statement) -> statement.setInt(index + 1, val.getInt(index)); case INTERVAL_DAY_TIME: case BIGINT: return (val, index, statement) -> statement.setLong(index + 1, val.getLong(index)); case TINYINT: return (val, index, statement) -> statement.setByte(index + 1, val.getByte(index)); case SMALLINT: return (val, index, statement) -> statement.setShort(index + 1, val.getShort(index)); case CHAR: case VARCHAR: // value is BinaryString return (val, index, statement) -> statement.setString(index + 1, val.getString(index).toString()); case BINARY: case VARBINARY: return (val, index, statement) -> statement.setBytes(index + 1, val.getBinary(index)); case DATE: return (val, index, statement) -> statement.setDate( index + 1, Date.valueOf(LocalDate.ofEpochDay(val.getInt(index)))); case TIME_WITHOUT_TIME_ZONE: return (val, index, statement) -> { LocalTime localTime = LocalTime.ofNanoOfDay(val.getInt(index) * 1_000_000L); statement.setTimestamp(index + 1, toFixedDateTimestamp(localTime)); }; case TIMESTAMP_WITH_TIME_ZONE: case TIMESTAMP_WITHOUT_TIME_ZONE: final int timestampPrecision = ((TimestampType) type).getPrecision(); return (val, index, statement) -> statement.setTimestamp( index + 1, val.getTimestamp(index, timestampPrecision).toTimestamp()); case TIMESTAMP_WITH_LOCAL_TIME_ZONE: final int localZonedTimestampPrecision = ((LocalZonedTimestampType) type).getPrecision(); return (val, index, statement) -> statement.setTimestamp( index + 1, Timestamp.from( val.getTimestamp(index, localZonedTimestampPrecision) .toInstant())); case DECIMAL: final int decimalPrecision = ((DecimalType) type).getPrecision(); final int decimalScale = ((DecimalType) type).getScale(); return (val, index, statement) -> statement.setBigDecimal( index + 1, val.getDecimal(index, decimalPrecision, decimalScale) .toBigDecimal()); case ARRAY: return (val, index, statement) -> statement.setArray( index + 1, (Object[]) ClickHouseConverterUtils.toExternal( val.getArray(index), type)); case MAP: return (val, index, statement) -> statement.setObject( index + 1, ClickHouseConverterUtils.toExternal(val.getMap(index), type)); case MULTISET: case ROW: case RAW: default: throw new UnsupportedOperationException("Unsupported type:" + 
type); } } @FunctionalInterface interface SerializationConverter extends Serializable { /** * Convert a internal field to to java object and fill into the {@link * ClickHousePreparedStatement}. */ void serialize(RowData rowData, int index, ClickHousePreparedStatement statement) throws SQLException; } @FunctionalInterface interface DeserializationConverter extends Serializable { /** * Convert a object of {@link ClickHouseResultSet} to the internal data structure object. */ Object deserialize(Object field) throws SQLException; } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/executor/ClickHouseBatchExecutor.java ================================================ // // Source code recreated from a .class file by IntelliJ IDEA // (powered by FernFlower decompiler) // package org.apache.flink.connector.clickhouse.internal.executor; import org.apache.flink.api.common.functions.RuntimeContext; import org.apache.flink.connector.clickhouse.internal.ClickHouseShardOutputFormat; import org.apache.flink.connector.clickhouse.internal.connection.ClickHouseConnectionProvider; import org.apache.flink.connector.clickhouse.internal.converter.ClickHouseRowConverter; import org.apache.flink.connector.clickhouse.internal.options.ClickHouseDmlOptions; import org.apache.flink.table.data.RowData; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import ru.yandex.clickhouse.ClickHouseConnection; import ru.yandex.clickhouse.ClickHousePreparedStatement; import java.sql.SQLException; /** ClickHouse's batch executor. */ public class ClickHouseBatchExecutor implements ClickHouseExecutor { private static final long serialVersionUID = 1L; private static final Logger LOG = LoggerFactory.getLogger(ClickHouseShardOutputFormat.class); private final String insertSql; private final ClickHouseRowConverter converter; private final int maxRetries; private transient ClickHousePreparedStatement statement; private transient ClickHouseConnectionProvider connectionProvider; public ClickHouseBatchExecutor( String insertSql, ClickHouseRowConverter converter, ClickHouseDmlOptions options) { this.insertSql = insertSql; this.converter = converter; this.maxRetries = options.getMaxRetries(); } @Override public void prepareStatement(ClickHouseConnection connection) throws SQLException { statement = (ClickHousePreparedStatement) connection.prepareStatement(insertSql); } @Override public void prepareStatement(ClickHouseConnectionProvider connectionProvider) throws SQLException { this.connectionProvider = connectionProvider; prepareStatement(connectionProvider.getOrCreateConnection()); } @Override public void setRuntimeContext(RuntimeContext context) {} @Override public void addToBatch(RowData record) throws SQLException { switch (record.getRowKind()) { case INSERT: converter.toExternal(record, statement); statement.addBatch(); break; case UPDATE_AFTER: case DELETE: case UPDATE_BEFORE: break; default: throw new UnsupportedOperationException( String.format( "Unknown row kind, the supported row kinds is: INSERT, UPDATE_BEFORE, UPDATE_AFTER, DELETE, but get: %s.", record.getRowKind())); } } @Override public void executeBatch() throws SQLException { attemptExecuteBatch(statement, maxRetries); } @Override public void closeStatement() { if (statement != null) { try { statement.close(); } catch (SQLException exception) { LOG.warn("ClickHouse batch statement could not be closed.", exception); } finally { statement = 
null; } } } @Override public String toString() { return "ClickHouseBatchExecutor{" + "insertSql='" + insertSql + '\'' + ", maxRetries=" + maxRetries + ", connectionProvider=" + connectionProvider + '}'; } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/executor/ClickHouseExecutor.java ================================================ // // Source code recreated from a .class file by IntelliJ IDEA // (powered by FernFlower decompiler) // package org.apache.flink.connector.clickhouse.internal.executor; import org.apache.flink.api.common.functions.RuntimeContext; import org.apache.flink.connector.clickhouse.internal.ClickHouseStatementFactory; import org.apache.flink.connector.clickhouse.internal.connection.ClickHouseConnectionProvider; import org.apache.flink.connector.clickhouse.internal.converter.ClickHouseRowConverter; import org.apache.flink.connector.clickhouse.internal.options.ClickHouseDmlOptions; import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.RowType; import org.apache.commons.lang3.ArrayUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import ru.yandex.clickhouse.ClickHouseConnection; import ru.yandex.clickhouse.ClickHousePreparedStatement; import java.io.Serializable; import java.sql.SQLException; import java.util.Arrays; import java.util.function.Function; import java.util.stream.IntStream; import static org.apache.flink.table.data.RowData.createFieldGetter; /** Executor interface for submitting data to ClickHouse. */ public interface ClickHouseExecutor extends Serializable { Logger LOG = LoggerFactory.getLogger(ClickHouseExecutor.class); void prepareStatement(ClickHouseConnection connection) throws SQLException; void prepareStatement(ClickHouseConnectionProvider connectionProvider) throws SQLException; void setRuntimeContext(RuntimeContext context); void addToBatch(RowData rowData) throws SQLException; void executeBatch() throws SQLException; void closeStatement(); default void attemptExecuteBatch(ClickHousePreparedStatement stmt, int maxRetries) throws SQLException { for (int i = 0; i < maxRetries; i++) { try { stmt.executeBatch(); return; } catch (Exception exception) { LOG.error("ClickHouse executeBatch error, retry times = {}", i, exception); try { Thread.sleep(1000L * i); } catch (InterruptedException ex) { Thread.currentThread().interrupt(); throw new SQLException( "Unable to flush; interrupted while doing another attempt", ex); } } } throw new SQLException( String.format( "Attempt to execute batch failed, exhausted retry times = %d", maxRetries)); } static ClickHouseExecutor createClickHouseExecutor( String tableName, String databaseName, String clusterName, String[] fieldNames, String[] keyFields, String[] partitionFields, LogicalType[] fieldTypes, ClickHouseDmlOptions options) { if (keyFields.length > 0) { return createUpsertExecutor( tableName, databaseName, clusterName, fieldNames, keyFields, partitionFields, fieldTypes, options); } else { return createBatchExecutor(tableName, fieldNames, fieldTypes, options); } } static ClickHouseBatchExecutor createBatchExecutor( String tableName, String[] fieldNames, LogicalType[] fieldTypes, ClickHouseDmlOptions options) { String insertSql = ClickHouseStatementFactory.getInsertIntoStatement(tableName, fieldNames); ClickHouseRowConverter 
converter = new ClickHouseRowConverter(RowType.of(fieldTypes)); return new ClickHouseBatchExecutor(insertSql, converter, options); } static ClickHouseUpsertExecutor createUpsertExecutor( String tableName, String databaseName, String clusterName, String[] fieldNames, String[] keyFields, String[] partitionFields, LogicalType[] fieldTypes, ClickHouseDmlOptions options) { String insertSql = ClickHouseStatementFactory.getInsertIntoStatement(tableName, fieldNames); String updateSql = ClickHouseStatementFactory.getUpdateStatement( tableName, databaseName, clusterName, fieldNames, keyFields, partitionFields); String deleteSql = ClickHouseStatementFactory.getDeleteStatement( tableName, databaseName, clusterName, keyFields); // Re-sort the order of fields to fit the sql statement. int[] delFields = Arrays.stream(keyFields) .mapToInt(pk -> ArrayUtils.indexOf(fieldNames, pk)) .toArray(); int[] updatableFields = IntStream.range(0, fieldNames.length) .filter(idx -> !ArrayUtils.contains(keyFields, fieldNames[idx])) .filter(idx -> !ArrayUtils.contains(partitionFields, fieldNames[idx])) .toArray(); int[] updFields = ArrayUtils.addAll(updatableFields, delFields); LogicalType[] delTypes = Arrays.stream(delFields).mapToObj(f -> fieldTypes[f]).toArray(LogicalType[]::new); LogicalType[] updTypes = Arrays.stream(updFields).mapToObj(f -> fieldTypes[f]).toArray(LogicalType[]::new); return new ClickHouseUpsertExecutor( insertSql, updateSql, deleteSql, new ClickHouseRowConverter(RowType.of(fieldTypes)), new ClickHouseRowConverter(RowType.of(updTypes)), new ClickHouseRowConverter(RowType.of(delTypes)), createExtractor(fieldTypes, updFields), createExtractor(fieldTypes, delFields), options); } static Function createExtractor(LogicalType[] logicalTypes, int[] fields) { final RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[fields.length]; for (int i = 0; i < fields.length; i++) { fieldGetters[i] = createFieldGetter(logicalTypes[fields[i]], fields[i]); } return row -> { GenericRowData rowData = new GenericRowData(row.getRowKind(), fieldGetters.length); for (int i = 0; i < fieldGetters.length; i++) { rowData.setField(i, fieldGetters[i].getFieldOrNull(row)); } return rowData; }; } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/executor/ClickHouseUpsertExecutor.java ================================================ // // Source code recreated from a .class file by IntelliJ IDEA // (powered by FernFlower decompiler) // package org.apache.flink.connector.clickhouse.internal.executor; import org.apache.flink.api.common.functions.RuntimeContext; import org.apache.flink.connector.clickhouse.internal.ClickHouseShardOutputFormat; import org.apache.flink.connector.clickhouse.internal.connection.ClickHouseConnectionProvider; import org.apache.flink.connector.clickhouse.internal.converter.ClickHouseRowConverter; import org.apache.flink.connector.clickhouse.internal.options.ClickHouseDmlOptions; import org.apache.flink.table.data.RowData; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import ru.yandex.clickhouse.ClickHouseConnection; import ru.yandex.clickhouse.ClickHousePreparedStatement; import java.sql.SQLException; import java.util.Arrays; import java.util.function.Function; /** ClickHouse's upsert executor. 
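 * <p>Keeps separate prepared statements for INSERT, ALTER TABLE ... UPDATE and ALTER TABLE ...
 * DELETE; UPDATE_BEFORE rows are skipped, and the update/delete extractor functions re-order
 * fields to match the parameter order of the generated SQL.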
*/ public class ClickHouseUpsertExecutor implements ClickHouseExecutor { private static final long serialVersionUID = 1L; private static final Logger LOG = LoggerFactory.getLogger(ClickHouseShardOutputFormat.class); private final String insertSql; private final String updateSql; private final String deleteSql; private final ClickHouseRowConverter insertConverter; private final ClickHouseRowConverter updateConverter; private final ClickHouseRowConverter deleteConverter; private final Function updateExtractor; private final Function deleteExtractor; private final int maxRetries; private transient ClickHousePreparedStatement insertStmt; private transient ClickHousePreparedStatement updateStmt; private transient ClickHousePreparedStatement deleteStmt; private transient ClickHouseConnectionProvider connectionProvider; public ClickHouseUpsertExecutor( String insertSql, String updateSql, String deleteSql, ClickHouseRowConverter insertConverter, ClickHouseRowConverter updateConverter, ClickHouseRowConverter deleteConverter, Function updateExtractor, Function deleteExtractor, ClickHouseDmlOptions options) { this.insertSql = insertSql; this.updateSql = updateSql; this.deleteSql = deleteSql; this.insertConverter = insertConverter; this.updateConverter = updateConverter; this.deleteConverter = deleteConverter; this.updateExtractor = updateExtractor; this.deleteExtractor = deleteExtractor; this.maxRetries = options.getMaxRetries(); } @Override public void prepareStatement(ClickHouseConnection connection) throws SQLException { this.insertStmt = (ClickHousePreparedStatement) connection.prepareStatement(this.insertSql); this.updateStmt = (ClickHousePreparedStatement) connection.prepareStatement(this.updateSql); this.deleteStmt = (ClickHousePreparedStatement) connection.prepareStatement(this.deleteSql); } @Override public void prepareStatement(ClickHouseConnectionProvider connectionProvider) throws SQLException { this.connectionProvider = connectionProvider; prepareStatement(connectionProvider.getOrCreateConnection()); } @Override public void setRuntimeContext(RuntimeContext context) {} @Override public void addToBatch(RowData record) throws SQLException { switch (record.getRowKind()) { case INSERT: insertConverter.toExternal(record, insertStmt); insertStmt.addBatch(); break; case UPDATE_AFTER: updateConverter.toExternal(updateExtractor.apply(record), updateStmt); updateStmt.addBatch(); break; case DELETE: deleteConverter.toExternal(deleteExtractor.apply(record), deleteStmt); deleteStmt.addBatch(); break; case UPDATE_BEFORE: break; default: throw new UnsupportedOperationException( String.format( "Unknown row kind, the supported row kinds is: INSERT, UPDATE_BEFORE, UPDATE_AFTER, DELETE, but get: %s.", record.getRowKind())); } } @Override public void executeBatch() throws SQLException { for (ClickHousePreparedStatement clickHousePreparedStatement : Arrays.asList(insertStmt, updateStmt, deleteStmt)) { if (clickHousePreparedStatement != null) { attemptExecuteBatch(clickHousePreparedStatement, maxRetries); } } } @Override public void closeStatement() { for (ClickHousePreparedStatement clickHousePreparedStatement : Arrays.asList(insertStmt, updateStmt, deleteStmt)) { if (clickHousePreparedStatement != null) { try { clickHousePreparedStatement.close(); } catch (SQLException exception) { LOG.warn("ClickHouse upsert statement could not be closed.", exception); } } } } @Override public String toString() { return "ClickHouseUpsertExecutor{" + "insertSql='" + insertSql + '\'' + ", updateSql='" + updateSql + '\'' + ", 
deleteSql='" + deleteSql + '\'' + ", maxRetries=" + maxRetries + ", connectionProvider=" + connectionProvider + '}'; } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/options/ClickHouseConnectionOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.internal.options; import javax.annotation.Nullable; import java.io.Serializable; import java.util.Optional; /** ClickHouse connection options. */ public class ClickHouseConnectionOptions implements Serializable { private static final long serialVersionUID = 1L; private final String url; private final String username; private final String password; private final String databaseName; private final String tableName; protected ClickHouseConnectionOptions( String url, @Nullable String username, @Nullable String password, String databaseName, String tableName) { this.url = url; this.username = username; this.password = password; this.databaseName = databaseName; this.tableName = tableName; } public String getUrl() { return this.url; } public Optional getUsername() { return Optional.ofNullable(this.username); } public Optional getPassword() { return Optional.ofNullable(this.password); } public String getDatabaseName() { return this.databaseName; } public String getTableName() { return this.tableName; } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/options/ClickHouseDmlOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.internal.options; import javax.annotation.Nullable; import java.time.Duration; /** ClickHouse data modify language options. 
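 * <p>Write-side options: batch size, flush interval, max retries, whether to write to the local
 * tables behind a Distributed table (useLocal), partition strategy and key, and whether DELETE
 * rows are ignored.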
*/ public class ClickHouseDmlOptions extends ClickHouseConnectionOptions { private static final long serialVersionUID = 1L; private final int batchSize; private final Duration flushInterval; private final int maxRetries; private final boolean useLocal; private final String partitionStrategy; private final String partitionKey; private final boolean ignoreDelete; private ClickHouseDmlOptions( String url, @Nullable String username, @Nullable String password, String databaseName, String tableName, int batchSize, Duration flushInterval, int maxRetires, boolean useLocal, String partitionStrategy, String partitionKey, boolean ignoreDelete) { super(url, username, password, databaseName, tableName); this.batchSize = batchSize; this.flushInterval = flushInterval; this.maxRetries = maxRetires; this.useLocal = useLocal; this.partitionStrategy = partitionStrategy; this.partitionKey = partitionKey; this.ignoreDelete = ignoreDelete; } public int getBatchSize() { return this.batchSize; } public Duration getFlushInterval() { return this.flushInterval; } public int getMaxRetries() { return this.maxRetries; } public boolean isUseLocal() { return this.useLocal; } public String getPartitionStrategy() { return this.partitionStrategy; } public String getPartitionKey() { return this.partitionKey; } public boolean getIgnoreDelete() { return this.ignoreDelete; } /** Builder for {@link ClickHouseDmlOptions}. */ public static class Builder { private String url; private String username; private String password; private String databaseName; private String tableName; private int batchSize; private Duration flushInterval; private int maxRetries; private boolean writeLocal; private boolean useLocal; private String partitionStrategy; private String partitionKey; private boolean ignoreDelete; public Builder() {} public ClickHouseDmlOptions.Builder withUrl(String url) { this.url = url; return this; } public ClickHouseDmlOptions.Builder withUsername(String username) { this.username = username; return this; } public ClickHouseDmlOptions.Builder withPassword(String password) { this.password = password; return this; } public ClickHouseDmlOptions.Builder withDatabaseName(String databaseName) { this.databaseName = databaseName; return this; } public ClickHouseDmlOptions.Builder withTableName(String tableName) { this.tableName = tableName; return this; } public ClickHouseDmlOptions.Builder withBatchSize(int batchSize) { this.batchSize = batchSize; return this; } public ClickHouseDmlOptions.Builder withFlushInterval(Duration flushInterval) { this.flushInterval = flushInterval; return this; } public ClickHouseDmlOptions.Builder withMaxRetries(int maxRetries) { this.maxRetries = maxRetries; return this; } public ClickHouseDmlOptions.Builder withWriteLocal(Boolean writeLocal) { this.writeLocal = writeLocal; return this; } public ClickHouseDmlOptions.Builder withUseLocal(Boolean useLocal) { this.useLocal = useLocal; return this; } public ClickHouseDmlOptions.Builder withPartitionStrategy(String partitionStrategy) { this.partitionStrategy = partitionStrategy; return this; } public ClickHouseDmlOptions.Builder withPartitionKey(String partitionKey) { this.partitionKey = partitionKey; return this; } public ClickHouseDmlOptions.Builder withIgnoreDelete(boolean ignoreDelete) { this.ignoreDelete = ignoreDelete; return this; } public ClickHouseDmlOptions build() { return new ClickHouseDmlOptions( url, username, password, databaseName, tableName, batchSize, flushInterval, maxRetries, writeLocal || useLocal, partitionStrategy, partitionKey, 
ignoreDelete); } } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/options/ClickHouseReadOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.internal.options; import javax.annotation.Nullable; /** ClickHouse read options. */ public class ClickHouseReadOptions extends ClickHouseConnectionOptions { private static final long serialVersionUID = 1L; private final boolean useLocal; private final String partitionColumn; private final Integer partitionNum; private final Long partitionLowerBound; private final Long partitionUpperBound; private ClickHouseReadOptions( String url, @Nullable String username, @Nullable String password, String databaseName, String tableName, boolean useLocal, String partitionColumn, Integer partitionNum, Long partitionLowerBound, Long partitionUpperBound) { super(url, username, password, databaseName, tableName); this.useLocal = useLocal; this.partitionColumn = partitionColumn; this.partitionNum = partitionNum; this.partitionLowerBound = partitionLowerBound; this.partitionUpperBound = partitionUpperBound; } public boolean isUseLocal() { return useLocal; } public String getPartitionColumn() { return partitionColumn; } public Integer getPartitionNum() { return partitionNum; } public Long getPartitionLowerBound() { return partitionLowerBound; } public Long getPartitionUpperBound() { return partitionUpperBound; } /** Builder for {@link ClickHouseReadOptions}. 
*/ public static class Builder { private String url; private String username; private String password; private String databaseName; private String tableName; private boolean useLocal; private String partitionColumn; private Integer partitionNum; private Long partitionLowerBound; private Long partitionUpperBound; public ClickHouseReadOptions.Builder withUrl(String url) { this.url = url; return this; } public ClickHouseReadOptions.Builder withUsername(String username) { this.username = username; return this; } public ClickHouseReadOptions.Builder withPassword(String password) { this.password = password; return this; } public ClickHouseReadOptions.Builder withDatabaseName(String databaseName) { this.databaseName = databaseName; return this; } public ClickHouseReadOptions.Builder withTableName(String tableName) { this.tableName = tableName; return this; } public ClickHouseReadOptions.Builder withUseLocal(boolean useLocal) { this.useLocal = useLocal; return this; } public ClickHouseReadOptions.Builder withPartitionColumn(String partitionColumn) { this.partitionColumn = partitionColumn; return this; } public ClickHouseReadOptions.Builder withPartitionNum(Integer partitionNum) { this.partitionNum = partitionNum; return this; } public Builder withPartitionLowerBound(Long partitionLowerBound) { this.partitionLowerBound = partitionLowerBound; return this; } public Builder withPartitionUpperBound(Long partitionUpperBound) { this.partitionUpperBound = partitionUpperBound; return this; } public ClickHouseReadOptions build() { return new ClickHouseReadOptions( url, username, password, databaseName, tableName, useLocal, partitionColumn, partitionNum, partitionLowerBound, partitionUpperBound); } } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/partitioner/BalancedPartitioner.java ================================================ // // Source code recreated from a .class file by IntelliJ IDEA // (powered by FernFlower decompiler) // package org.apache.flink.connector.clickhouse.internal.partitioner; import org.apache.flink.table.data.RowData; /** Use round-robin mode to partition data. */ public class BalancedPartitioner implements ClickHousePartitioner { private static final long serialVersionUID = 1L; private int nextShard = 0; public BalancedPartitioner() {} @Override public int select(RowData record, int numShards) { nextShard = (nextShard + 1) % numShards; return nextShard; } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/partitioner/ClickHousePartitioner.java ================================================ // // Source code recreated from a .class file by IntelliJ IDEA // (powered by FernFlower decompiler) // package org.apache.flink.connector.clickhouse.internal.partitioner; import org.apache.flink.table.data.RowData; import org.apache.flink.table.data.RowData.FieldGetter; import java.io.Serializable; /** ClickHouse data partitioner interface. 
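 * <p>select returns the target shard index for a record; built-in strategies are "balanced"
 * (round-robin), "shuffle" (random) and "hash" (hash of a key field modulo the shard count).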
*/ public interface ClickHousePartitioner extends Serializable { String BALANCED = "balanced"; String SHUFFLE = "shuffle"; String HASH = "hash"; int select(RowData record, int numShards); static ClickHousePartitioner createBalanced() { return new BalancedPartitioner(); } static ClickHousePartitioner createShuffle() { return new ShufflePartitioner(); } static ClickHousePartitioner createHash(FieldGetter getter) { return new HashPartitioner(getter); } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/partitioner/HashPartitioner.java ================================================ // // Source code recreated from a .class file by IntelliJ IDEA // (powered by FernFlower decompiler) // package org.apache.flink.connector.clickhouse.internal.partitioner; import org.apache.flink.table.data.RowData; import org.apache.flink.table.data.RowData.FieldGetter; import java.util.Objects; /** Use primary-key's hash code to partition data. */ public class HashPartitioner implements ClickHousePartitioner { private static final long serialVersionUID = 1L; private final FieldGetter getter; public HashPartitioner(FieldGetter getter) { this.getter = getter; } @Override public int select(RowData record, int numShards) { return Math.abs(Objects.hashCode(getter.getFieldOrNull(record)) % numShards); } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/internal/partitioner/ShufflePartitioner.java ================================================ // // Source code recreated from a .class file by IntelliJ IDEA // (powered by FernFlower decompiler) // package org.apache.flink.connector.clickhouse.internal.partitioner; import org.apache.flink.table.data.RowData; import java.util.concurrent.ThreadLocalRandom; /** Shuffle data by random numbers. */ public class ShufflePartitioner implements ClickHousePartitioner { private static final long serialVersionUID = 1L; public ShufflePartitioner() {} @Override public int select(RowData record, int numShards) { return ThreadLocalRandom.current().nextInt(numShards); } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/split/ClickHouseBatchBetweenParametersProvider.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.split; import static org.apache.flink.util.Preconditions.checkArgument; /** For example, $columnName BETWEEN ? AND ? 
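// Illustrative sketch (not part of the repository): choosing a shard with the partitioners above.
// The single-INT row and shard count are made-up example values.
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.logical.IntType;

class ClickHousePartitionerSketch {
    public static void main(String[] args) {
        RowData row = GenericRowData.of(42); // one INT column used as the sharding key
        RowData.FieldGetter keyGetter = RowData.createFieldGetter(new IntType(), 0);
        int numShards = 3;
        int hashShard = ClickHousePartitioner.createHash(keyGetter).select(row, numShards);
        int randomShard = ClickHousePartitioner.createShuffle().select(row, numShards);
        System.out.println(hashShard + " / " + randomShard);
    }
}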
*/ public class ClickHouseBatchBetweenParametersProvider extends ClickHouseBetweenParametersProvider { public ClickHouseBatchBetweenParametersProvider(long minVal, long maxVal) { super(minVal, maxVal); } @Override public ClickHouseBatchBetweenParametersProvider ofBatchNum(Integer batchNum) { checkArgument(batchNum != null && batchNum > 0, "Batch number must be positive"); long maxElemCount = (maxVal - minVal) + 1; if (batchNum > maxElemCount) { batchNum = (int) maxElemCount; } this.batchNum = batchNum; return this; } @Override public ClickHouseBatchBetweenParametersProvider calculate() { this.parameterValues = divideParameterValues(batchNum); return this; } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/split/ClickHouseBetweenParametersProvider.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.split; import java.io.Serializable; import static org.apache.flink.util.Preconditions.checkArgument; import static org.apache.flink.util.Preconditions.checkState; /** This class is used to compute the list of parallel query to run (i.e. splits). */ public abstract class ClickHouseBetweenParametersProvider extends ClickHouseParametersProvider { private static final String BETWEEN_CLAUSE = "`%s` BETWEEN ? AND ?"; protected final long minVal; protected final long maxVal; public ClickHouseBetweenParametersProvider(long minVal, long maxVal) { checkArgument(maxVal >= minVal, "maxVal must be larger than minVal"); this.minVal = minVal; this.maxVal = maxVal; } @Override public String getParameterClause() { return BETWEEN_CLAUSE; } protected Serializable[][] divideParameterValues(int batchNum) { long maxElemCount = (maxVal - minVal) + 1; long batchSize = new Double(Math.ceil((double) maxElemCount / batchNum)).longValue(); long bigBatchNum = maxElemCount - (batchSize - 1) * batchNum; checkState(batchSize > 0, "Batch size and batch number must be positive."); Serializable[][] parameters = new Serializable[batchNum][2]; long start = minVal; for (int i = 0; i < batchNum; i++) { long end = start + batchSize - 1 - (i >= bigBatchNum ? 1 : 0); parameters[i] = new Long[] {start, end}; start = end + 1; } return parameters; } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/split/ClickHouseParametersProvider.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
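// Illustrative sketch (not part of the repository): splitting the key range [0, 99] into four
// BETWEEN bounds, as divideParameterValues() above computes them.
import java.io.Serializable;

class BetweenSplitSketch {
    static Serializable[][] splits() {
        return new ClickHouseBatchBetweenParametersProvider(0L, 99L)
                .ofBatchNum(4)
                .calculate()
                .getParameterValues(); // -> [0,24], [25,49], [50,74], [75,99]
    }
}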
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.split; import java.io.Serializable; /** Clickhouse parameters provider. */ public abstract class ClickHouseParametersProvider { protected Serializable[][] parameterValues; protected Serializable[][] shardIdValues; protected int batchNum; /** Returns the necessary parameters array to use for query in parallel a table. */ public Serializable[][] getParameterValues() { return parameterValues; } /** Returns the shard ids that the parameter values act on. */ public Serializable[][] getShardIdValues() { return shardIdValues; } public abstract String getParameterClause(); public abstract ClickHouseParametersProvider ofBatchNum(Integer batchNum); public abstract ClickHouseParametersProvider calculate(); // -------------------------- Methods for local tables -------------------------- protected int[] allocateShards(int minBatchSize, int minBatchNum, int length) { int[] shards = new int[length]; for (int i = 0; i < length; i++) { if (i + 1 <= minBatchNum) { shards[i] = minBatchSize; } else { shards[i] = minBatchSize + 1; } } return shards; } protected Integer[] subShardIds(int start, int idNum, int[] shardIds) { Integer[] subIds = new Integer[idNum]; for (int i = 0; i < subIds.length; i++) { subIds[i] = shardIds[start + i]; } return subIds; } /** Builder. */ public static class Builder { private Long minVal; private Long maxVal; private Integer batchNum; private int[] shardIds; private boolean useLocal; public Builder setMinVal(Long minVal) { this.minVal = minVal; return this; } public Builder setMaxVal(Long maxVal) { this.maxVal = maxVal; return this; } public Builder setBatchNum(Integer batchNum) { this.batchNum = batchNum; return this; } public Builder setShardIds(int[] shardIds) { this.shardIds = shardIds; return this; } public Builder setUseLocal(boolean useLocal) { this.useLocal = useLocal; return this; } public ClickHouseParametersProvider build() { ClickHouseParametersProvider parametersProvider = null; if (minVal == null || maxVal == null) { if (useLocal) { parametersProvider = new ClickHouseShardTableParametersProvider(shardIds); } else { throw new RuntimeException("No suitable ClickHouseParametersProvider found."); } } if (parametersProvider == null) { parametersProvider = useLocal && shardIds != null ? new ClickHouseShardBetweenParametersProvider( minVal, maxVal, shardIds) : new ClickHouseBatchBetweenParametersProvider(minVal, maxVal); } return parametersProvider.ofBatchNum(batchNum).calculate(); } } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/split/ClickHouseShardBetweenParametersProvider.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
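// Illustrative sketch (not part of the repository): how the Builder above selects a provider.
// With min/max bounds and useLocal=false it builds a ClickHouseBatchBetweenParametersProvider;
// with only shard ids and useLocal=true it builds a ClickHouseShardTableParametersProvider.
class ProviderBuilderSketch {
    static ClickHouseParametersProvider boundedSplits() {
        return new ClickHouseParametersProvider.Builder()
                .setMinVal(0L)
                .setMaxVal(9_999L)
                .setBatchNum(8)
                .setUseLocal(false)
                .build();
    }
}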
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.split; import org.apache.flink.annotation.Experimental; import org.apache.commons.lang3.ArrayUtils; import java.io.Serializable; import java.util.Arrays; import static org.apache.flink.util.Preconditions.checkArgument; /** For example, $columnName BETWEEN ? AND ? */ @Experimental public class ClickHouseShardBetweenParametersProvider extends ClickHouseBetweenParametersProvider { private final int[] shardIds; private final int shardNum; public ClickHouseShardBetweenParametersProvider(long minVal, long maxVal, int[] shardIds) { super(minVal, maxVal); checkArgument(shardIds.length > 1, "length of shardIds must be larger than 0"); this.shardIds = shardIds; this.shardNum = shardIds.length; } @Override public ClickHouseShardBetweenParametersProvider ofBatchNum(Integer batchNum) { checkArgument(batchNum != null && batchNum > 0, "batchNum must be positive"); long maxElemCount = Math.max(maxVal - minVal, 1) * shardNum + 1; if (batchNum > maxElemCount) { batchNum = (int) maxElemCount; } this.batchNum = batchNum; return this; } @Override public ClickHouseShardBetweenParametersProvider calculate() { Serializable[][] parameters = null; Integer[][] shardIdValues = null; float factor = ((float) batchNum) / shardNum; if (factor >= 1) { // e.g. batchNum = 10, shardNum = 3. int minBatchSize = (int) factor; int minBatchNum = (minBatchSize + 1) * shardNum - batchNum; int[] info = allocateShards(minBatchSize, minBatchNum, shardNum); for (int i = 0; i < info.length; i++) { parameters = ArrayUtils.addAll(parameters, divideParameterValues(info[i])); shardIdValues = ArrayUtils.addAll(shardIdValues, repeatShardId(shardIds[i], info[i])); } } else if (factor < 1) { // e.g. batchNum = 10, shardNum = 23. int minBatchSize = (int) (1 / factor); int minBatchNum = (minBatchSize + 1) * batchNum - shardNum; int[] info = allocateShards(minBatchSize, minBatchNum, batchNum); for (int i = 0; i < info.length; i++) { int start = Arrays.stream(ArrayUtils.subarray(info, 0, i)).sum(); parameters = ArrayUtils.addAll(parameters, divideParameterValues(1)); shardIdValues = ArrayUtils.add(shardIdValues, subShardIds(start, info[i], shardIds)); } } this.parameterValues = parameters; this.shardIdValues = shardIdValues; return this; } private Integer[][] repeatShardId(int shardId, int shardNum) { Integer[][] shards = new Integer[shardNum][1]; for (int i = 0; i < shardNum; i++) { shards[i] = new Integer[] {shardId}; } return shards; } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/split/ClickHouseShardTableParametersProvider.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
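// Illustrative sketch (not part of the repository): with two shards and batchNum = 4, the
// calculate() above assigns two BETWEEN ranges to each shard id: shard 1 -> [0,49], [50,99]
// and shard 2 -> [0,49], [50,99].
class ShardBetweenSketch {
    static ClickHouseShardBetweenParametersProvider splits() {
        return new ClickHouseShardBetweenParametersProvider(0L, 99L, new int[] {1, 2})
                .ofBatchNum(4)
                .calculate();
    }
}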
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.split; import org.apache.flink.annotation.Experimental; import org.apache.commons.lang3.ArrayUtils; import java.util.Arrays; import static org.apache.flink.util.Preconditions.checkArgument; /** For example, $columnName BETWEEN ? AND ? */ @Experimental public class ClickHouseShardTableParametersProvider extends ClickHouseParametersProvider { private final int[] shardIds; private final int shardNum; public ClickHouseShardTableParametersProvider(int[] shardIds) { checkArgument(shardIds.length > 1, "length of shardIds must be larger than 0"); this.shardIds = shardIds; this.shardNum = shardIds.length; } @Override public String getParameterClause() { return null; } @Override public ClickHouseShardTableParametersProvider ofBatchNum(Integer batchNum) { batchNum = batchNum != null ? batchNum : shardNum; checkArgument(batchNum > 0, "batchNum must be positive"); if (batchNum > shardNum) { batchNum = shardNum; } this.batchNum = batchNum; return this; } @Override public ClickHouseShardTableParametersProvider calculate() { int minBatchSize = shardNum / batchNum; int minBatchNum = (minBatchSize + 1) * batchNum - shardNum; int[] info = allocateShards(minBatchSize, minBatchNum, batchNum); Integer[][] shardIdValues = null; for (int i = 0; i < info.length; i++) { int start = Arrays.stream(ArrayUtils.subarray(info, 0, i)).sum(); shardIdValues = ArrayUtils.add(shardIdValues, subShardIds(start, info[i], shardIds)); } this.shardIdValues = shardIdValues; return this; } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/util/ClickHouseTypeUtil.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
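// Illustrative sketch (not part of the repository): spreading five local-table shards over two
// read splits; calculate() above yields the shard-id groups {1, 2} and {3, 4, 5}.
import java.io.Serializable;

class ShardTableSplitSketch {
    static Serializable[][] shardGroups() {
        return new ClickHouseShardTableParametersProvider(new int[] {1, 2, 3, 4, 5})
                .ofBatchNum(2)
                .calculate()
                .getShardIdValues();
    }
}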
*/ package org.apache.flink.connector.clickhouse.util; import org.apache.flink.table.api.DataTypes; import org.apache.flink.table.catalog.exceptions.CatalogException; import org.apache.flink.table.types.DataType; import ru.yandex.clickhouse.response.ClickHouseColumnInfo; import java.util.regex.Matcher; import java.util.regex.Pattern; import static org.apache.flink.table.types.logical.DecimalType.MAX_PRECISION; /** Type utils. */ public class ClickHouseTypeUtil { private static final Pattern INTERNAL_TYPE_PATTERN = Pattern.compile(".*?\\((?<type>.*)\\)"); /** Convert clickhouse data type to flink data type. Whether to indicate nullable? */ public static DataType toFlinkType(ClickHouseColumnInfo clickHouseColumnInfo) { switch (clickHouseColumnInfo.getClickHouseDataType()) { case Int8: return DataTypes.TINYINT(); case Int16: case UInt8: return DataTypes.SMALLINT(); case Int32: case UInt16: case IntervalYear: case IntervalMonth: case IntervalWeek: case IntervalDay: case IntervalHour: case IntervalQuarter: case IntervalMinute: case IntervalSecond: return DataTypes.INT(); case Int64: case UInt32: return DataTypes.BIGINT(); case Int128: case Int256: case UInt64: case UInt128: case UInt256: return DataTypes.DECIMAL(MAX_PRECISION, 0); case Float32: return DataTypes.FLOAT(); case Float64: return DataTypes.DOUBLE(); case Decimal: return DataTypes.DECIMAL( clickHouseColumnInfo.getPrecision(), clickHouseColumnInfo.getScale()); case Decimal32: return DataTypes.DECIMAL(9, clickHouseColumnInfo.getScale()); case Decimal64: return DataTypes.DECIMAL(18, clickHouseColumnInfo.getScale()); case Decimal128: case Decimal256: return DataTypes.DECIMAL( Math.min(MAX_PRECISION, clickHouseColumnInfo.getPrecision()), Math.min(MAX_PRECISION, clickHouseColumnInfo.getScale())); case String: case Enum8: case Enum16: return DataTypes.STRING(); case FixedString: case IPv4: case IPv6: case UUID: return DataTypes.VARCHAR(clickHouseColumnInfo.getPrecision()); case Date: return DataTypes.DATE(); case DateTime: case DateTime32: case DateTime64: return DataTypes.TIMESTAMP(clickHouseColumnInfo.getScale()); case Array: String arrayBaseType = getInternalClickHouseType(clickHouseColumnInfo.getOriginalTypeName()); ClickHouseColumnInfo arrayBaseColumnInfo = ClickHouseColumnInfo.parse( arrayBaseType, clickHouseColumnInfo.getColumnName() + ".array_base", clickHouseColumnInfo.getTimeZone()); return DataTypes.ARRAY(toFlinkType(arrayBaseColumnInfo)); case Map: return DataTypes.MAP( toFlinkType(clickHouseColumnInfo.getKeyInfo()), toFlinkType(clickHouseColumnInfo.getValueInfo())); case Tuple: case Nested: case AggregateFunction: default: throw new UnsupportedOperationException( "Unsupported type:" + clickHouseColumnInfo.getClickHouseDataType()); } } private static String getInternalClickHouseType(String clickHouseTypeLiteral) { Matcher matcher = INTERNAL_TYPE_PATTERN.matcher(clickHouseTypeLiteral); if (matcher.find()) { return matcher.group("type"); } else { throw new CatalogException( String.format("No content found in the bucket of '%s'", clickHouseTypeLiteral)); } } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/util/ClickHouseUtil.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership.
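// Illustrative sketch (not part of the repository): mapping a ClickHouse column type literal to
// a Flink DataType through the switch above; ClickHouseColumnInfo.parse(...) is the same
// three-argument factory already used by the Array branch.
import java.util.TimeZone;
import org.apache.flink.table.types.DataType;
import ru.yandex.clickhouse.response.ClickHouseColumnInfo;

class TypeMappingSketch {
    static DataType decimalColumn() {
        ClickHouseColumnInfo column =
                ClickHouseColumnInfo.parse("Decimal(10, 2)", "amount", TimeZone.getTimeZone("UTC"));
        return ClickHouseTypeUtil.toFlinkType(column); // -> DataTypes.DECIMAL(10, 2)
    }
}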
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.util; import org.apache.commons.lang3.StringUtils; import org.apache.flink.connector.clickhouse.internal.common.DistributedEngineFullSchema; import ru.yandex.clickhouse.ClickHouseConnection; import javax.annotation.Nullable; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Timestamp; import java.time.LocalDate; import java.time.LocalDateTime; import java.time.LocalTime; import java.util.Map; import java.util.Properties; import java.util.regex.Matcher; import java.util.regex.Pattern; import static org.apache.flink.connector.clickhouse.config.ClickHouseConfig.PROPERTIES_PREFIX; /** clickhouse util. */ public class ClickHouseUtil { public static final String EMPTY = ""; private static final LocalDate DATE_PREFIX_OF_TIME = LocalDate.ofEpochDay(1); // Matches the cluster name of a Distributed table defined without macros private static final Pattern DISTRIBUTED_TABLE_ENGINE_PATTERN = Pattern.compile( "Distributed\\((?<cluster>[a-zA-Z_][0-9a-zA-Z_]*),\\s*(?<database>[a-zA-Z_][0-9a-zA-Z_]*),\\s*(?<table>[a-zA-Z_][0-9a-zA-Z_]*)"); // Matches the cluster name of a Distributed table defined with macros private static final Pattern DISTRIBUTED_TABLE_ENGINE_MACROS_PATTERN = Pattern.compile( "Distributed\\((?<cluster>\\{[a-zA-Z_][0-9a-zA-Z_]*\\}),\\s*(?<database>[a-zA-Z_][0-9a-zA-Z_]*),\\s*(?<table>[a-zA-Z_][0-9a-zA-Z_]*)"); private static final String QUERY_TABLE_ENGINE_SQL = "SELECT engine_full FROM system.tables WHERE database = ? AND name = ?"; // Looks up the actual cluster name that a macro resolves to private static final String QUERY_MACRO_CLUSTER_SQL = "select substitution from system.macros where macro=?"; public static String getJdbcUrl(String url, @Nullable String database) { try { database = database != null ? database : ""; return url + "/" + database; //return "jdbc:" + (new URIBuilder(url)).setPath("/" + database).build().toString(); } catch (Exception e) { throw new IllegalStateException(String.format("Cannot parse url: %s", url), e); } } public static DistributedEngineFullSchema getAndParseDistributedEngineSchema( ClickHouseConnection connection, String databaseName, String tableName) throws SQLException { String engineFull = ""; // Match the Distributed table information defined without macros try (PreparedStatement stmt = connection.prepareStatement(QUERY_TABLE_ENGINE_SQL)) { stmt.setString(1, databaseName); stmt.setString(2, tableName); try (ResultSet rs = stmt.executeQuery()) { if (rs.next()) { engineFull = rs.getString("engine_full"); Matcher matcher = DISTRIBUTED_TABLE_ENGINE_PATTERN.matcher(engineFull.replace("'", "")); if (matcher.find()) { String cluster = "cluster"; String database = matcher.group("database"); String table = matcher.group("table"); return DistributedEngineFullSchema.of(cluster, database, table); } } } } // Match the Distributed table information defined with macros Matcher matcherMacro = DISTRIBUTED_TABLE_ENGINE_MACROS_PATTERN.matcher(engineFull.replace("'", "")); if (matcherMacro.find()) { String macroCluster = "cluster"; String cluster = ""; if (StringUtils.isNotBlank(macroCluster)) { // Query the system.macros table with the macro variable to obtain the real virtual cluster name try (PreparedStatement stmt = connection.prepareStatement(QUERY_MACRO_CLUSTER_SQL)) { stmt.setString(1, macroCluster); try (ResultSet rs = stmt.executeQuery()) { if (rs.next()) { cluster = rs.getString("substitution"); } } } } if (StringUtils.isBlank(cluster)) { throw new IllegalStateException("Failed to resolve the real cluster name behind the macro substitution"); } String database = matcherMacro.group("database"); String table = matcherMacro.group("table"); return DistributedEngineFullSchema.of(cluster, database, table); } else { return null; } } public static Properties getClickHouseProperties(Map<String, String> tableOptions) { final Properties properties = new Properties(); tableOptions.keySet().stream() .filter(key -> key.startsWith(PROPERTIES_PREFIX)) .forEach( key -> { final String value = tableOptions.get(key); final String subKey = key.substring((PROPERTIES_PREFIX).length()); properties.setProperty(subKey, value); }); return properties; } public static Timestamp toFixedDateTimestamp(LocalTime localTime) { LocalDateTime localDateTime = localTime.atDate(DATE_PREFIX_OF_TIME); return Timestamp.valueOf(localDateTime); } public static String quoteIdentifier(String identifier) { return String.join(EMPTY, "`", identifier, "`"); } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/util/FilterPushDownHelper.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.
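// Illustrative sketch (not part of the repository): the two small helpers above in isolation.
class ClickHouseUtilSketch {
    static void demo() {
        // getJdbcUrl appends the database segment to the configured url
        String jdbcUrl = ClickHouseUtil.getJdbcUrl("jdbc:clickhouse://127.0.0.1:8123", "demo");
        // quoteIdentifier wraps an identifier in backticks for generated SQL
        String quoted = ClickHouseUtil.quoteIdentifier("order_id"); // -> `order_id`
        System.out.println(jdbcUrl + " -> " + quoted);
    }
}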
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.util; import org.apache.flink.table.expressions.CallExpression; import org.apache.flink.table.expressions.FieldReferenceExpression; import org.apache.flink.table.expressions.ResolvedExpression; import org.apache.flink.table.expressions.ValueLiteralExpression; import org.apache.flink.table.functions.BuiltInFunctionDefinitions; import org.apache.flink.table.functions.FunctionDefinition; import ru.yandex.clickhouse.util.ClickHouseValueFormatter; import java.sql.Time; import java.sql.Timestamp; import java.time.Instant; import java.time.LocalTime; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.TimeZone; import java.util.function.Function; import static java.util.stream.Collectors.joining; import static org.apache.flink.connector.clickhouse.util.ClickHouseUtil.EMPTY; import static org.apache.flink.connector.clickhouse.util.ClickHouseUtil.quoteIdentifier; import static org.apache.flink.connector.clickhouse.util.ClickHouseUtil.toFixedDateTimestamp; import static org.apache.flink.connector.clickhouse.util.SqlClause.AND; import static org.apache.flink.connector.clickhouse.util.SqlClause.EQ; import static org.apache.flink.connector.clickhouse.util.SqlClause.GT; import static org.apache.flink.connector.clickhouse.util.SqlClause.GT_EQ; import static org.apache.flink.connector.clickhouse.util.SqlClause.IS_NOT_NULL; import static org.apache.flink.connector.clickhouse.util.SqlClause.IS_NULL; import static org.apache.flink.connector.clickhouse.util.SqlClause.LT; import static org.apache.flink.connector.clickhouse.util.SqlClause.LT_EQ; import static org.apache.flink.connector.clickhouse.util.SqlClause.NOT_EQ; import static org.apache.flink.connector.clickhouse.util.SqlClause.OR; /** Filter push down, convert flink expression to clickhouse filter clause. 
*/ public class FilterPushDownHelper { private static final Map FILTERS = new HashMap<>(); static { FILTERS.put(BuiltInFunctionDefinitions.EQUALS, EQ); FILTERS.put(BuiltInFunctionDefinitions.NOT_EQUALS, NOT_EQ); FILTERS.put(BuiltInFunctionDefinitions.GREATER_THAN, GT); FILTERS.put(BuiltInFunctionDefinitions.GREATER_THAN_OR_EQUAL, GT_EQ); FILTERS.put(BuiltInFunctionDefinitions.LESS_THAN, LT); FILTERS.put(BuiltInFunctionDefinitions.LESS_THAN_OR_EQUAL, LT_EQ); FILTERS.put(BuiltInFunctionDefinitions.IS_NULL, IS_NULL); FILTERS.put(BuiltInFunctionDefinitions.IS_NOT_NULL, IS_NOT_NULL); FILTERS.put(BuiltInFunctionDefinitions.AND, AND); FILTERS.put(BuiltInFunctionDefinitions.OR, OR); } private FilterPushDownHelper() {} public static String convert(List filters) { int filterSize = filters.size(); return filters.stream() .map(expression -> FilterPushDownHelper.convertExpression(expression, filterSize)) .filter(Optional::isPresent) .map(Optional::get) .collect(joining(" AND ")); } private static Optional convertExpression( ResolvedExpression resolvedExpression, int filterSize) { if (!(resolvedExpression instanceof CallExpression)) { return Optional.empty(); } CallExpression call = (CallExpression) resolvedExpression; SqlClause sqlClause = FILTERS.get(call.getFunctionDefinition()); if (sqlClause == null) { return Optional.empty(); } switch (sqlClause) { case EQ: return convertFieldAndLiteral(EQ.formatter, call); case NOT_EQ: return convertFieldAndLiteral(NOT_EQ.formatter, call); case GT: return convertFieldAndLiteral(GT.formatter, call); case GT_EQ: return convertFieldAndLiteral(GT_EQ.formatter, call); case LT: return convertFieldAndLiteral(LT.formatter, call); case LT_EQ: return convertFieldAndLiteral(LT_EQ.formatter, call); case IS_NULL: return convertOnlyChild(IS_NULL.formatter, call); case IS_NOT_NULL: return convertOnlyChild(IS_NOT_NULL.formatter, call); case OR: return convertLogicExpression(OR.formatter, call, filterSize); case AND: return convertLogicExpression(AND.formatter, call, filterSize); default: return Optional.empty(); } } private static Optional convertOnlyChild( Function sqlClauseFormatter, CallExpression call) { List children = call.getResolvedChildren(); if (children.size() != 1) { return Optional.empty(); } ResolvedExpression child = children.get(0); if (!(child instanceof FieldReferenceExpression)) { return Optional.empty(); } FieldReferenceExpression fieldExpression = (FieldReferenceExpression) child; String fieldName = quoteIdentifier(fieldExpression.getName()); return Optional.of(sqlClauseFormatter.apply(new String[] {fieldName})); } private static Optional convertLogicExpression( Function sqlClauseFormatter, CallExpression call, int filterSize) { List args = call.getResolvedChildren(); if (args.size() != 2) { return Optional.empty(); } String left = convertExpression(args.get(0), args.size()).orElse(null); String right = convertExpression(args.get(1), args.size()).orElse(null); if (left == null || right == null) { return Optional.empty(); } String sqlClause = sqlClauseFormatter.apply(new String[] {left, right}); if (filterSize > 1) { sqlClause = String.join(EMPTY, "(", sqlClause, ")"); } return Optional.of(sqlClause); } private static Optional convertFieldAndLiteral( Function sqlClauseFormatter, CallExpression callExpression) { List args = callExpression.getResolvedChildren(); if (args.size() != 2) { return Optional.empty(); } FieldReferenceExpression fieldExpression = args.stream() .filter(expression -> expression instanceof FieldReferenceExpression) .map(expression -> 
((FieldReferenceExpression) expression)) .findAny() .orElse(null); ValueLiteralExpression literalExpression = args.stream() .filter(expression -> expression instanceof ValueLiteralExpression) .map(expression -> (ValueLiteralExpression) expression) .findAny() .orElse(null); if (fieldExpression == null || literalExpression == null) { return Optional.empty(); } String fieldName = quoteIdentifier(fieldExpression.getName()); String literalValue = convertLiteral(literalExpression).orElse(null); if (literalValue == null) { return Optional.empty(); } return Optional.of(sqlClauseFormatter.apply(new String[] {fieldName, literalValue})); } private static Optional convertLiteral(ValueLiteralExpression expression) { return expression .getValueAs(expression.getOutputDataType().getLogicalType().getDefaultConversion()) .map( o -> { TimeZone timeZone = getFlinkTimeZone(); String value; if (o instanceof Time) { value = ClickHouseValueFormatter.formatTimestamp( toFixedDateTimestamp(((Time) o).toLocalTime()), timeZone); } else if (o instanceof LocalTime) { value = ClickHouseValueFormatter.formatTimestamp( toFixedDateTimestamp((LocalTime) o), timeZone); } else if (o instanceof Instant) { value = ClickHouseValueFormatter.formatTimestamp( Timestamp.from((Instant) o), timeZone); } else { value = ClickHouseValueFormatter.formatObject( o, timeZone, timeZone); } value = ClickHouseValueFormatter.needsQuoting(o) ? String.join(EMPTY, "'", value, "'") : value; return value; }); } /** TODO The timezone configured via `table.local-time-zone` should be used. */ private static TimeZone getFlinkTimeZone() { return TimeZone.getDefault(); } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/java-flink-1.14/org/apache/flink/connector/clickhouse/util/SqlClause.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.clickhouse.util; import java.util.function.Function; /** SQL filters that support push down. 
*/ public enum SqlClause { EQ(args -> String.format("%s = %s", args[0], args[1])), NOT_EQ(args -> String.format("%s <> %s", args[0], args[1])), GT(args -> String.format("%s > %s", args[0], args[1])), GT_EQ(args -> String.format("%s >= %s", args[0], args[1])), LT(args -> String.format("%s < %s", args[0], args[1])), LT_EQ(args -> String.format("%s <= %s", args[0], args[1])), IS_NULL(args -> String.format("%s IS NULL", args[0])), IS_NOT_NULL(args -> String.format("%s IS NOT NULL", args[0])), AND(args -> String.format("%s AND %s", args[0], args[1])), OR(args -> String.format("%s OR %s", args[0], args[1])); public final Function formatter; SqlClause(final Function function) { this.formatter = function; } } ================================================ FILE: fire-connectors/flink-connectors/flink-clickhouse/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory ================================================ # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. org.apache.flink.connector.clickhouse.ClickHouseDynamicTableFactory org.apache.flink.connector.clickhouse.catalog.ClickHouseCatalogFactory ================================================ FILE: fire-connectors/flink-connectors/flink-es/pom.xml ================================================ 4.0.0 flink-es jar Fire : Connectors : Fink : ElasticSearch com.zto.fire fire-flink-connectors 2.3.2-SNAPSHOT ../pom.xml scala-tools.org Scala-Tools Maven2 Repository http://scala-tools.org/repo-releases scala-tools.org Scala-Tools Maven2 Repository http://scala-tools.org/repo-releases org.scala-lang scala-library ${scala.version} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/pom.xml ================================================ 4.0.0 fire-connector-flink-rocketmq_${flink.reference} jar Fire : Connectors : Fink : RocketMQ com.zto.fire fire-flink-connectors 2.3.2-SNAPSHOT ../pom.xml org.apache.flink flink-java ${flink.version} ${maven.scope} org.apache.flink flink-streaming-java_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-clients_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.rocketmq rocketmq-client ${rocketmq.version} org.apache.rocketmq rocketmq-acl ${rocketmq.version} ${maven.scope} org.apache.rocketmq rocketmq-common ${rocketmq.version} io.netty netty-tcnative commons-lang commons-lang ${commons-lang.version} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/RocketMQConfig.java ================================================ /** * Licensed to the Apache Software 
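// Illustrative sketch (not part of the repository): each enum constant's formatter renders one
// textual filter clause; FilterPushDownHelper above joins the per-expression results with " AND ".
class SqlClauseSketch {
    static String greaterOrEqual() {
        String[] operands = {"`create_time`", "'2022-01-01 00:00:00'"};
        return (String) SqlClause.GT_EQ.formatter.apply(operands); // -> "`create_time` >= '2022-01-01 00:00:00'"
    }
}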
Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.Validate; import org.apache.rocketmq.acl.common.AclClientRPCHook; import org.apache.rocketmq.acl.common.SessionCredentials; import org.apache.rocketmq.client.ClientConfig; import org.apache.rocketmq.client.consumer.DefaultMQPullConsumer; import org.apache.rocketmq.client.producer.DefaultMQProducer; import org.apache.rocketmq.common.protocol.heartbeat.MessageModel; import java.util.Properties; import java.util.UUID; import static org.apache.rocketmq.flink.RocketMQUtils.getInteger; /** * RocketMQConfig for Consumer/Producer. */ public class RocketMQConfig { // Server Config public static final String NAME_SERVER_ADDR = "nameserver.address"; // Required public static final String NAME_SERVER_POLL_INTERVAL = "nameserver.poll.interval"; public static final int DEFAULT_NAME_SERVER_POLL_INTERVAL = 30000; // 30 seconds public static final String BROKER_HEART_BEAT_INTERVAL = "brokerserver.heartbeat.interval"; public static final int DEFAULT_BROKER_HEART_BEAT_INTERVAL = 30000; // 30 seconds // Producer related config public static final String PRODUCER_GROUP = "producer.group"; public static final String PRODUCER_RETRY_TIMES = "producer.retry.times"; public static final int DEFAULT_PRODUCER_RETRY_TIMES = 3; public static final String PRODUCER_TIMEOUT = "producer.timeout"; public static final int DEFAULT_PRODUCER_TIMEOUT = 3000; // 3 seconds public static final String ACCESS_KEY = "access.key"; public static final String SECRET_KEY = "secret.key"; // Consumer related config public static final String CONSUMER_GROUP = "consumer.group"; // Required public static final String CONSUMER_TOPIC = "consumer.topic"; // Required public static final String CONSUMER_TAG = "consumer.tag"; public static final String DEFAULT_CONSUMER_TAG = "*"; public static final String CONSUMER_OFFSET_RESET_TO = "consumer.offset.reset.to"; public static final String CONSUMER_OFFSET_LATEST = "latest"; public static final String CONSUMER_OFFSET_EARLIEST = "earliest"; public static final String CONSUMER_OFFSET_TIMESTAMP = "timestamp"; public static final String CONSUMER_OFFSET_FROM_TIMESTAMP = "consumer.offset.from.timestamp"; public static final String CONSUMER_OFFSET_PERSIST_INTERVAL = "consumer.offset.persist.interval"; public static final int DEFAULT_CONSUMER_OFFSET_PERSIST_INTERVAL = 5000; // 5 seconds public static final String CONSUMER_PULL_POOL_SIZE = "consumer.pull.thread.pool.size"; public static final int DEFAULT_CONSUMER_PULL_POOL_SIZE = 20; public static final String CONSUMER_BATCH_SIZE = "consumer.batch.size"; public static final int DEFAULT_CONSUMER_BATCH_SIZE = 32; public static final String CONSUMER_DELAY_WHEN_MESSAGE_NOT_FOUND = "consumer.delay.when.message.not.found"; public 
static final int DEFAULT_CONSUMER_DELAY_WHEN_MESSAGE_NOT_FOUND = 10; public static final String MSG_DELAY_LEVEL = "msg.delay.level"; public static final int MSG_DELAY_LEVEL00 = 0; // no delay public static final int MSG_DELAY_LEVEL01 = 1; // 1s public static final int MSG_DELAY_LEVEL02 = 2; // 5s public static final int MSG_DELAY_LEVEL03 = 3; // 10s public static final int MSG_DELAY_LEVEL04 = 4; // 30s public static final int MSG_DELAY_LEVEL05 = 5; // 1min public static final int MSG_DELAY_LEVEL06 = 6; // 2min public static final int MSG_DELAY_LEVEL07 = 7; // 3min public static final int MSG_DELAY_LEVEL08 = 8; // 4min public static final int MSG_DELAY_LEVEL09 = 9; // 5min public static final int MSG_DELAY_LEVEL10 = 10; // 6min public static final int MSG_DELAY_LEVEL11 = 11; // 7min public static final int MSG_DELAY_LEVEL12 = 12; // 8min public static final int MSG_DELAY_LEVEL13 = 13; // 9min public static final int MSG_DELAY_LEVEL14 = 14; // 10min public static final int MSG_DELAY_LEVEL15 = 15; // 20min public static final int MSG_DELAY_LEVEL16 = 16; // 30min public static final int MSG_DELAY_LEVEL17 = 17; // 1h public static final int MSG_DELAY_LEVEL18 = 18; // 2h /** * Build Producer Configs. * @param props Properties * @param producer DefaultMQProducer */ public static void buildProducerConfigs(Properties props, DefaultMQProducer producer) { buildCommonConfigs(props, producer); String group = props.getProperty(PRODUCER_GROUP); if (StringUtils.isEmpty(group)) { group = UUID.randomUUID().toString(); } producer.setProducerGroup(props.getProperty(PRODUCER_GROUP, group)); producer.setRetryTimesWhenSendFailed(getInteger(props, PRODUCER_RETRY_TIMES, DEFAULT_PRODUCER_RETRY_TIMES)); producer.setRetryTimesWhenSendAsyncFailed(getInteger(props, PRODUCER_RETRY_TIMES, DEFAULT_PRODUCER_RETRY_TIMES)); producer.setSendMsgTimeout(getInteger(props, PRODUCER_TIMEOUT, DEFAULT_PRODUCER_TIMEOUT)); } /** * Build Consumer Configs. * @param props Properties * @param consumer DefaultMQPushConsumer */ public static void buildConsumerConfigs(Properties props, DefaultMQPullConsumer consumer) { buildCommonConfigs(props, consumer); consumer.setMessageModel(MessageModel.CLUSTERING); consumer.setPersistConsumerOffsetInterval(getInteger(props, CONSUMER_OFFSET_PERSIST_INTERVAL, DEFAULT_CONSUMER_OFFSET_PERSIST_INTERVAL)); } /** * Build Common Configs. * @param props Properties * @param client ClientConfig */ public static void buildCommonConfigs(Properties props, ClientConfig client) { String nameServers = props.getProperty(NAME_SERVER_ADDR); Validate.notEmpty(nameServers); client.setNamesrvAddr(nameServers); client.setPollNameServerInterval(getInteger(props, NAME_SERVER_POLL_INTERVAL, DEFAULT_NAME_SERVER_POLL_INTERVAL)); client.setHeartbeatBrokerInterval(getInteger(props, BROKER_HEART_BEAT_INTERVAL, DEFAULT_BROKER_HEART_BEAT_INTERVAL)); } /** * Build credentials for client. 
* @param props * @return */ public static AclClientRPCHook buildAclRPCHook(Properties props) { String accessKey = props.getProperty(ACCESS_KEY); String secretKey = props.getProperty(SECRET_KEY); if (!StringUtils.isEmpty(accessKey) && !StringUtils.isEmpty(secretKey)) { AclClientRPCHook aclClientRPCHook = new AclClientRPCHook(new SessionCredentials(accessKey, secretKey)); return aclClientRPCHook; } return null; } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/RocketMQSink.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink; import org.apache.commons.lang.Validate; import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.FunctionInitializationContext; import org.apache.flink.runtime.state.FunctionSnapshotContext; import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; import org.apache.flink.streaming.api.functions.sink.RichSinkFunction; import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; import org.apache.rocketmq.client.exception.MQClientException; import org.apache.rocketmq.client.producer.DefaultMQProducer; import org.apache.rocketmq.client.producer.SendCallback; import org.apache.rocketmq.client.producer.SendResult; import org.apache.rocketmq.client.producer.SendStatus; import org.apache.rocketmq.common.message.Message; import org.apache.rocketmq.flink.common.selector.TopicSelector; import org.apache.rocketmq.flink.common.serialization.KeyValueSerializationSchema; import org.apache.rocketmq.remoting.exception.RemotingException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.charset.StandardCharsets; import java.util.LinkedList; import java.util.List; import java.util.Properties; import java.util.UUID; /** * The RocketMQSink provides at-least-once reliability guarantees when * checkpoints are enabled and batchFlushOnCheckpoint(true) is set. * Otherwise, the sink reliability guarantees depends on rocketmq producer's retry policy. 
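// Illustrative sketch (not part of the repository): driving producer construction from the
// property keys defined above; the name-server address and group name are made-up examples.
import java.util.Properties;
import org.apache.rocketmq.client.producer.DefaultMQProducer;

class RocketMQProducerConfigSketch {
    static DefaultMQProducer buildProducer() {
        Properties props = new Properties();
        props.setProperty(RocketMQConfig.NAME_SERVER_ADDR, "127.0.0.1:9876"); // required
        props.setProperty(RocketMQConfig.PRODUCER_GROUP, "demo_producer_group");
        props.setProperty(RocketMQConfig.PRODUCER_RETRY_TIMES, "5");
        DefaultMQProducer producer = new DefaultMQProducer(RocketMQConfig.buildAclRPCHook(props));
        RocketMQConfig.buildProducerConfigs(props, producer);
        return producer;
    }
}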
*/ public class RocketMQSink extends RichSinkFunction implements CheckpointedFunction { private static final long serialVersionUID = 1L; private static final Logger LOG = LoggerFactory.getLogger(RocketMQSink.class); private transient DefaultMQProducer producer; private boolean async; // false by default private Properties props; private TopicSelector topicSelector; private KeyValueSerializationSchema serializationSchema; private boolean batchFlushOnCheckpoint; // false by default private int batchSize = 1000; private List batchList; private int messageDeliveryDelayLevel = RocketMQConfig.MSG_DELAY_LEVEL00; public RocketMQSink(KeyValueSerializationSchema schema, TopicSelector topicSelector, Properties props) { this.serializationSchema = schema; this.topicSelector = topicSelector; this.props = props; if (this.props != null) { this.messageDeliveryDelayLevel = RocketMQUtils.getInteger(this.props, RocketMQConfig.MSG_DELAY_LEVEL, RocketMQConfig.MSG_DELAY_LEVEL00); if (this.messageDeliveryDelayLevel < RocketMQConfig.MSG_DELAY_LEVEL00) { this.messageDeliveryDelayLevel = RocketMQConfig.MSG_DELAY_LEVEL00; } else if (this.messageDeliveryDelayLevel > RocketMQConfig.MSG_DELAY_LEVEL18) { this.messageDeliveryDelayLevel = RocketMQConfig.MSG_DELAY_LEVEL18; } } } @Override public void open(Configuration parameters) throws Exception { Validate.notEmpty(props, "Producer properties can not be empty"); Validate.notNull(topicSelector, "TopicSelector can not be null"); Validate.notNull(serializationSchema, "KeyValueSerializationSchema can not be null"); producer = new DefaultMQProducer(RocketMQConfig.buildAclRPCHook(props)); producer.setInstanceName(String.valueOf(getRuntimeContext().getIndexOfThisSubtask()) + "_" + UUID.randomUUID()); RocketMQConfig.buildProducerConfigs(props, producer); batchList = new LinkedList<>(); if (batchFlushOnCheckpoint && !((StreamingRuntimeContext) getRuntimeContext()).isCheckpointingEnabled()) { LOG.warn("Flushing on checkpoint is enabled, but checkpointing is not enabled. Disabling flushing."); batchFlushOnCheckpoint = false; } try { producer.start(); } catch (MQClientException e) { throw new RuntimeException(e); } } @Override public void invoke(IN input, Context context) throws Exception { Message msg = prepareMessage(input); if (batchFlushOnCheckpoint) { batchList.add(msg); if (batchList.size() >= batchSize) { flushSync(); } return; } if (async) { try { producer.send(msg, new SendCallback() { @Override public void onSuccess(SendResult sendResult) { LOG.debug("Async send message success! result: {}", sendResult); } @Override public void onException(Throwable throwable) { if (throwable != null) { LOG.error("Async send message failure!", throwable); } } }); } catch (Exception e) { LOG.error("Async send message failure!", e); } } else { try { SendResult result = producer.send(msg); LOG.debug("Sync send message result: {}", result); if (result.getSendStatus() != SendStatus.SEND_OK) { throw new RemotingException(result.toString()); } } catch (Exception e) { LOG.error("Sync send message failure!", e); throw e; } } } private Message prepareMessage(IN input) { String topic = topicSelector.getTopic(input); String tag = (tag = topicSelector.getTag(input)) != null ? tag : ""; byte[] k = serializationSchema.serializeKey(input); String key = k != null ? 
new String(k, StandardCharsets.UTF_8) : ""; byte[] value = serializationSchema.serializeValue(input); Validate.notNull(topic, "the message topic is null"); Validate.notNull(value, "the message body is null"); Message msg = new Message(topic, tag, key, value); if (this.messageDeliveryDelayLevel > RocketMQConfig.MSG_DELAY_LEVEL00) { msg.setDelayTimeLevel(this.messageDeliveryDelayLevel); } return msg; } public RocketMQSink withAsync(boolean async) { this.async = async; return this; } public RocketMQSink withBatchFlushOnCheckpoint(boolean batchFlushOnCheckpoint) { this.batchFlushOnCheckpoint = batchFlushOnCheckpoint; return this; } public RocketMQSink withBatchSize(int batchSize) { this.batchSize = batchSize; return this; } @Override public void close() throws Exception { if (producer != null) { try { flushSync(); } catch (Exception e) { LOG.error("FlushSync failure!", e); } // make sure producer can be shutdown, thus current producerGroup will be unregistered producer.shutdown(); } } private void flushSync() throws Exception { if (batchFlushOnCheckpoint) { synchronized (batchList) { if (batchList.size() > 0) { producer.send(batchList); batchList.clear(); } } } } @Override public void snapshotState(FunctionSnapshotContext context) throws Exception { flushSync(); } @Override public void initializeState(FunctionInitializationContext context) throws Exception { // Nothing to do } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/RocketMQSinkWithTag.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
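// Illustrative sketch (not part of the repository): constructing the sink above with hypothetical
// minimal schema/selector implementations, written against the methods prepareMessage() calls
// (serializeKey/serializeValue and getTopic/getTag). Topic and tag names are made-up examples.
import java.nio.charset.StandardCharsets;
import java.util.Properties;
import org.apache.rocketmq.flink.common.selector.TopicSelector;
import org.apache.rocketmq.flink.common.serialization.KeyValueSerializationSchema;

class RocketMQSinkSketch {
    static RocketMQSink buildSink(Properties producerProps) {
        KeyValueSerializationSchema schema = new KeyValueSerializationSchema() {
            @Override public byte[] serializeKey(Object value) { return null; } // no message key
            @Override public byte[] serializeValue(Object value) {
                return value.toString().getBytes(StandardCharsets.UTF_8);
            }
        };
        TopicSelector selector = new TopicSelector() {
            @Override public String getTopic(Object value) { return "demo_topic"; }
            @Override public String getTag(Object value) { return "demo_tag"; }
        };
        return new RocketMQSink(schema, selector, producerProps)
                .withAsync(false)
                .withBatchFlushOnCheckpoint(true)
                .withBatchSize(512);
    }
}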
*/ package org.apache.rocketmq.flink; import org.apache.commons.lang.Validate; import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.FunctionInitializationContext; import org.apache.flink.runtime.state.FunctionSnapshotContext; import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; import org.apache.flink.streaming.api.functions.sink.RichSinkFunction; import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; import org.apache.flink.table.data.RowData; import org.apache.rocketmq.client.exception.MQClientException; import org.apache.rocketmq.client.producer.DefaultMQProducer; import org.apache.rocketmq.client.producer.SendCallback; import org.apache.rocketmq.client.producer.SendResult; import org.apache.rocketmq.client.producer.SendStatus; import org.apache.rocketmq.common.message.Message; import org.apache.rocketmq.flink.common.selector.TopicSelector; import org.apache.rocketmq.flink.common.serialization.JsonSerializationSchema; import org.apache.rocketmq.remoting.exception.RemotingException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.LinkedList; import java.util.List; import java.util.Properties; import java.util.UUID; /** * The RocketMQSink provides at-least-once reliability guarantees when * checkpoints are enabled and batchFlushOnCheckpoint(true) is set. * Otherwise, the sink reliability guarantees depends on rocketmq producer's retry policy. */ public class RocketMQSinkWithTag extends RichSinkFunction implements CheckpointedFunction { private static final long serialVersionUID = 1L; private static final Logger LOG = LoggerFactory.getLogger(RocketMQSink.class); private transient DefaultMQProducer producer; private boolean async; // false by default private Properties props; private TopicSelector topicSelector; private JsonSerializationSchema serializationSchema; private boolean batchFlushOnCheckpoint; // false by default private int batchSize = 1000; private List batchList; private int messageDeliveryDelayLevel = RocketMQConfig.MSG_DELAY_LEVEL00; public RocketMQSinkWithTag(JsonSerializationSchema schema, TopicSelector topicSelector, Properties props) { this.serializationSchema = schema; this.topicSelector = topicSelector; this.props = props; if (this.props != null) { this.messageDeliveryDelayLevel = RocketMQUtils.getInteger(this.props, RocketMQConfig.MSG_DELAY_LEVEL, RocketMQConfig.MSG_DELAY_LEVEL00); if (this.messageDeliveryDelayLevel < RocketMQConfig.MSG_DELAY_LEVEL00) { this.messageDeliveryDelayLevel = RocketMQConfig.MSG_DELAY_LEVEL00; } else if (this.messageDeliveryDelayLevel > RocketMQConfig.MSG_DELAY_LEVEL18) { this.messageDeliveryDelayLevel = RocketMQConfig.MSG_DELAY_LEVEL18; } } } @Override public void open(Configuration parameters) throws Exception { Validate.notEmpty(props, "Producer properties can not be empty"); Validate.notNull(topicSelector, "TopicSelector can not be null"); Validate.notNull(serializationSchema, "KeyValueSerializationSchema can not be null"); producer = new DefaultMQProducer(RocketMQConfig.buildAclRPCHook(props)); producer.setInstanceName(String.valueOf(getRuntimeContext().getIndexOfThisSubtask()) + "_" + UUID.randomUUID()); RocketMQConfig.buildProducerConfigs(props, producer); batchList = new LinkedList<>(); if (batchFlushOnCheckpoint && !((StreamingRuntimeContext) getRuntimeContext()).isCheckpointingEnabled()) { LOG.warn("Flushing on checkpoint is enabled, but checkpointing is not enabled. 
Disabling flushing."); batchFlushOnCheckpoint = false; } try { producer.start(); } catch (MQClientException e) { throw new RuntimeException(e); } } @Override public void invoke(IN input, Context context) throws Exception { Message msg = prepareMessage(input); if (batchFlushOnCheckpoint) { batchList.add(msg); if (batchList.size() >= batchSize) { flushSync(); } return; } if (async) { try { producer.send(msg, new SendCallback() { @Override public void onSuccess(SendResult sendResult) { LOG.debug("Async send message success! result: {}", sendResult); } @Override public void onException(Throwable throwable) { if (throwable != null) { LOG.error("Async send message failure!", throwable); } } }); } catch (Exception e) { LOG.error("Async send message failure!", e); } } else { try { SendResult result = producer.send(msg); LOG.debug("Sync send message result: {}", result); if (result.getSendStatus() != SendStatus.SEND_OK) { throw new RemotingException(result.toString()); } } catch (Exception e) { LOG.error("Sync send message failure!", e); throw e; } } } private Message prepareMessage(IN input) { /*String topic = topicSelector.getTopic(input); String tag = (tag = topicSelector.getTag(input)) != null ? tag : ""; byte[] k = serializationSchema.serializeKey(input); String key = k != null ? new String(k, StandardCharsets.UTF_8) : ""; byte[] value = serializationSchema.serializeValue(input); Validate.notNull(topic, "the message topic is null"); Validate.notNull(value, "the message body is null"); Message msg = new Message(topic, tag, key, value);*/ RowData rowData = (RowData) input; Message msg = serializationSchema.serialize(rowData); if (this.messageDeliveryDelayLevel > RocketMQConfig.MSG_DELAY_LEVEL00) { msg.setDelayTimeLevel(this.messageDeliveryDelayLevel); } return msg; } public RocketMQSinkWithTag withAsync(boolean async) { this.async = async; return this; } public RocketMQSinkWithTag withBatchFlushOnCheckpoint(boolean batchFlushOnCheckpoint) { this.batchFlushOnCheckpoint = batchFlushOnCheckpoint; return this; } public RocketMQSinkWithTag withBatchSize(int batchSize) { this.batchSize = batchSize; return this; } @Override public void close() throws Exception { if (producer != null) { try { flushSync(); } catch (Exception e) { LOG.error("FlushSync failure!", e); } // make sure producer can be shutdown, thus current producerGroup will be unregistered producer.shutdown(); } } private void flushSync() throws Exception { if (batchFlushOnCheckpoint) { synchronized (batchList) { if (batchList.size() > 0) { producer.send(batchList); batchList.clear(); } } } } @Override public void snapshotState(FunctionSnapshotContext context) throws Exception { flushSync(); } @Override public void initializeState(FunctionInitializationContext context) throws Exception { // Nothing to do } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/RocketMQSource.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE * file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the * License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. */ package org.apache.rocketmq.flink; import org.apache.commons.collections.map.LinkedMap; import org.apache.commons.lang.Validate; import org.apache.flink.api.common.state.ListState; import org.apache.flink.api.common.state.ListStateDescriptor; import org.apache.flink.api.common.typeinfo.TypeHint; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.api.java.typeutils.ResultTypeQueryable; import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.CheckpointListener; import org.apache.flink.runtime.state.FunctionInitializationContext; import org.apache.flink.runtime.state.FunctionSnapshotContext; import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; import org.apache.rocketmq.client.consumer.*; import org.apache.rocketmq.client.exception.MQClientException; import org.apache.rocketmq.common.message.MessageExt; import org.apache.rocketmq.common.message.MessageQueue; import org.apache.rocketmq.flink.common.serialization.KeyValueDeserializationSchema; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.charset.StandardCharsets; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import static org.apache.rocketmq.flink.RocketMQConfig.*; import static org.apache.rocketmq.flink.RocketMQUtils.getInteger; import static org.apache.rocketmq.flink.RocketMQUtils.getLong; /** * The RocketMQSource is based on RocketMQ pull consumer mode, and provides exactly once reliability guarantees when * checkpoints are enabled. Otherwise, the source doesn't provide any reliability guarantees. */ public class RocketMQSource extends RichParallelSourceFunction implements CheckpointedFunction, CheckpointListener, ResultTypeQueryable { private static final long serialVersionUID = 1L; private static final Logger LOG = LoggerFactory.getLogger(RocketMQSource.class); private transient MQPullConsumerScheduleService pullConsumerScheduleService; private DefaultMQPullConsumer consumer; private KeyValueDeserializationSchema schema; private RunningChecker runningChecker; private transient ListState> unionOffsetStates; private Map offsetTable; private Map restoredOffsets; /** Data for pending but uncommitted offsets. 
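Offsets are recorded here per checkpoint id in snapshotState() and are written back to the broker only once notifyCheckpointComplete() confirms that checkpoint, which is what ties offset commits to successful checkpoints.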
*/ private LinkedMap pendingOffsetsToCommit; private Properties props; private String topic; private String group; private static final String OFFSETS_STATE_NAME = "topic-partition-offset-states"; private transient volatile boolean restored; private transient boolean enableCheckpoint; public RocketMQSource(KeyValueDeserializationSchema schema, Properties props) { this.schema = schema; this.props = props; } @Override public void open(Configuration parameters) throws Exception { LOG.debug("source open...."); Validate.notEmpty(props, "Consumer properties can not be empty"); Validate.notNull(schema, "KeyValueDeserializationSchema can not be null"); this.topic = props.getProperty(RocketMQConfig.CONSUMER_TOPIC); this.group = props.getProperty(RocketMQConfig.CONSUMER_GROUP); Validate.notEmpty(topic, "Consumer topic can not be empty"); Validate.notEmpty(group, "Consumer group can not be empty"); this.enableCheckpoint = ((StreamingRuntimeContext) getRuntimeContext()).isCheckpointingEnabled(); if (offsetTable == null) { offsetTable = new ConcurrentHashMap<>(); } if (restoredOffsets == null) { restoredOffsets = new ConcurrentHashMap<>(); } if (pendingOffsetsToCommit == null) { pendingOffsetsToCommit = new LinkedMap(); } runningChecker = new RunningChecker(); //Wait for lite pull consumer pullConsumerScheduleService = new MQPullConsumerScheduleService(group, RocketMQConfig.buildAclRPCHook(props)); consumer = pullConsumerScheduleService.getDefaultMQPullConsumer(); consumer.setInstanceName(String.valueOf(getRuntimeContext().getIndexOfThisSubtask()) + "_" + UUID.randomUUID()); RocketMQConfig.buildConsumerConfigs(props, consumer); } @Override public void run(SourceContext context) throws Exception { LOG.debug("source run...."); // The lock that guarantees that record emission and state updates are atomic, // from the view of taking a checkpoint. final Object lock = context.getCheckpointLock(); int delayWhenMessageNotFound = getInteger(props, RocketMQConfig.CONSUMER_DELAY_WHEN_MESSAGE_NOT_FOUND, RocketMQConfig.DEFAULT_CONSUMER_DELAY_WHEN_MESSAGE_NOT_FOUND); String tag = props.getProperty(RocketMQConfig.CONSUMER_TAG, RocketMQConfig.DEFAULT_CONSUMER_TAG); int pullPoolSize = getInteger(props, RocketMQConfig.CONSUMER_PULL_POOL_SIZE, RocketMQConfig.DEFAULT_CONSUMER_PULL_POOL_SIZE); int pullBatchSize = getInteger(props, RocketMQConfig.CONSUMER_BATCH_SIZE, RocketMQConfig.DEFAULT_CONSUMER_BATCH_SIZE); pullConsumerScheduleService.setPullThreadNums(pullPoolSize); pullConsumerScheduleService.registerPullTaskCallback(topic, new PullTaskCallback() { @Override public void doPullTask(MessageQueue mq, PullTaskContext pullTaskContext) { try { long offset = getMessageQueueOffset(mq); if (offset < 0) { return; } PullResult pullResult = consumer.pull(mq, tag, offset, pullBatchSize); boolean found = false; switch (pullResult.getPullStatus()) { case FOUND: List messages = pullResult.getMsgFoundList(); for (MessageExt msg : messages) { byte[] key = msg.getKeys() != null ? 
msg.getKeys().getBytes(StandardCharsets.UTF_8) : null; byte[] value = msg.getBody(); OUT data = schema.deserializeKeyAndValue(key, value); // output and state update are atomic synchronized (lock) { context.collectWithTimestamp(data, msg.getBornTimestamp()); } } found = true; break; case NO_MATCHED_MSG: LOG.debug("No matched message after offset {} for queue {}", offset, mq); break; case NO_NEW_MSG: break; case OFFSET_ILLEGAL: LOG.warn("Offset {} is illegal for queue {}", offset, mq); break; default: break; } synchronized (lock) { putMessageQueueOffset(mq, pullResult.getNextBeginOffset()); } if (found) { pullTaskContext.setPullNextDelayTimeMillis(0); // no delay when messages were found } else { pullTaskContext.setPullNextDelayTimeMillis(delayWhenMessageNotFound); } } catch (Exception e) { throw new RuntimeException(e); } } }); try { pullConsumerScheduleService.start(); } catch (MQClientException e) { throw new RuntimeException(e); } runningChecker.setRunning(true); awaitTermination(); } private void awaitTermination() throws InterruptedException { while (runningChecker.isRunning()) { Thread.sleep(50); } } private long getMessageQueueOffset(MessageQueue mq) throws MQClientException { Long offset = offsetTable.get(mq); // restoredOffsets(unionOffsetStates) is the restored global union state; // should only snapshot mqs that actually belong to us if (restored && offset == null) { offset = restoredOffsets.get(mq); } if (offset == null) { offset = consumer.fetchConsumeOffset(mq, false); if (offset < 0) { String initialOffset = props.getProperty(RocketMQConfig.CONSUMER_OFFSET_RESET_TO, CONSUMER_OFFSET_LATEST); switch (initialOffset) { case CONSUMER_OFFSET_EARLIEST: offset = consumer.minOffset(mq); break; case CONSUMER_OFFSET_LATEST: offset = consumer.maxOffset(mq); break; case CONSUMER_OFFSET_TIMESTAMP: offset = consumer.searchOffset(mq, getLong(props, RocketMQConfig.CONSUMER_OFFSET_FROM_TIMESTAMP, System.currentTimeMillis())); break; default: throw new IllegalArgumentException("Unknown value for CONSUMER_OFFSET_RESET_TO."); } } } offsetTable.put(mq, offset); return offsetTable.get(mq); } private void putMessageQueueOffset(MessageQueue mq, long offset) throws MQClientException { offsetTable.put(mq, offset); if (!enableCheckpoint) { consumer.updateConsumeOffset(mq, offset); } } @Override public void cancel() { LOG.debug("cancel ..."); runningChecker.setRunning(false); if (pullConsumerScheduleService != null) { pullConsumerScheduleService.shutdown(); } if (offsetTable != null) { offsetTable.clear(); } if (restoredOffsets != null) { restoredOffsets.clear(); } if (pendingOffsetsToCommit != null) { pendingOffsetsToCommit.clear(); } } @Override public void close() throws Exception { LOG.debug("close ..."); // pretty much the same logic as cancelling try { cancel(); } finally { super.close(); } } @Override public void snapshotState(FunctionSnapshotContext context) throws Exception { // called when a snapshot for a checkpoint is requested if (!runningChecker.isRunning()) { LOG.debug("snapshotState() called on closed source; returning null."); return; } if (LOG.isDebugEnabled()) { LOG.debug("Snapshotting state {} ...", context.getCheckpointId()); } unionOffsetStates.clear(); HashMap currentOffsets = new HashMap<>(offsetTable.size()); // remove the unassigned queues in order to avoid read the wrong offset when the source restart Set assignedQueues = consumer.fetchMessageQueuesInBalance(topic); offsetTable.entrySet().removeIf(item -> !assignedQueues.contains(item.getKey())); for (Map.Entry entry : 
offsetTable.entrySet()) { unionOffsetStates.add(Tuple2.of(entry.getKey(), entry.getValue())); currentOffsets.put(entry.getKey(), entry.getValue()); } pendingOffsetsToCommit.put(context.getCheckpointId(), currentOffsets); if (LOG.isDebugEnabled()) { LOG.debug("Snapshotted state, last processed offsets: {}, checkpoint id: {}, timestamp: {}", offsetTable, context.getCheckpointId(), context.getCheckpointTimestamp()); } } @Override public void initializeState(FunctionInitializationContext context) throws Exception { // called every time the user-defined function is initialized, // be that when the function is first initialized or be that // when the function is actually recovering from an earlier checkpoint. // Given this, initializeState() is not only the place where different types of state are initialized, // but also where state recovery logic is included. LOG.debug("initialize State ..."); this.unionOffsetStates = context.getOperatorStateStore().getUnionListState(new ListStateDescriptor<>( OFFSETS_STATE_NAME, TypeInformation.of(new TypeHint>() { }))); this.restored = context.isRestored(); if (restored) { if (restoredOffsets == null) { restoredOffsets = new ConcurrentHashMap<>(); } for (Tuple2 mqOffsets : unionOffsetStates.get()) { if (!restoredOffsets.containsKey(mqOffsets.f0) || restoredOffsets.get(mqOffsets.f0) < mqOffsets.f1) { restoredOffsets.put(mqOffsets.f0, mqOffsets.f1); } } LOG.info("Setting restore state in the consumer. Using the following offsets: {}", restoredOffsets); } else { LOG.info("No restore state for the consumer."); } } @Override public TypeInformation getProducedType() { return schema.getProducedType(); } @Override public void notifyCheckpointComplete(long checkpointId) throws Exception { // callback when checkpoint complete if (!runningChecker.isRunning()) { LOG.debug("notifyCheckpointComplete() called on closed source; returning null."); return; } final int posInMap = pendingOffsetsToCommit.indexOf(checkpointId); if (posInMap == -1) { LOG.warn("Received confirmation for unknown checkpoint id {}", checkpointId); return; } Map offsets = (Map) pendingOffsetsToCommit.remove(posInMap); // remove older checkpoints in map for (int i = 0; i < posInMap; i++) { pendingOffsetsToCommit.remove(0); } if (offsets == null || offsets.size() == 0) { LOG.debug("Checkpoint state was empty."); return; } for (Map.Entry entry : offsets.entrySet()) { consumer.updateConsumeOffset(entry.getKey(), entry.getValue()); } } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/RocketMQUtils.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.rocketmq.flink; import java.util.Properties; public final class RocketMQUtils { public static int getInteger(Properties props, String key, int defaultValue) { return Integer.parseInt(props.getProperty(key, String.valueOf(defaultValue))); } public static long getLong(Properties props, String key, long defaultValue) { return Long.parseLong(props.getProperty(key, String.valueOf(defaultValue))); } public static boolean getBoolean(Properties props, String key, boolean defaultValue) { return Boolean.parseBoolean(props.getProperty(key, String.valueOf(defaultValue))); } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/RunningChecker.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink; import java.io.Serializable; public class RunningChecker implements Serializable { private volatile boolean isRunning = false; public boolean isRunning() { return isRunning; } public void setRunning(boolean running) { isRunning = running; } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/common/selector/DefaultTopicSelector.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.rocketmq.flink.common.selector; public class DefaultTopicSelector implements TopicSelector { private final String topicName; private final String tagName; public DefaultTopicSelector(final String topicName, final String tagName) { this.topicName = topicName; this.tagName = tagName; } public DefaultTopicSelector(final String topicName) { this(topicName, ""); } @Override public String getTopic(T tuple) { return topicName; } @Override public String getTag(T tuple) { return tagName; } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/common/selector/SimpleTopicSelector.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink.common.selector; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Map; /** * Uses field name to select topic and tag name from tuple. */ public class SimpleTopicSelector implements TopicSelector { private static final Logger LOG = LoggerFactory.getLogger(SimpleTopicSelector.class); private final String topicFieldName; private final String defaultTopicName; private final String tagFieldName; private final String defaultTagName; /** * SimpleTopicSelector Constructor. * @param topicFieldName field name used for selecting topic * @param defaultTopicName default field name used for selecting topic * @param tagFieldName field name used for selecting tag * @param defaultTagName default field name used for selecting tag */ public SimpleTopicSelector(String topicFieldName, String defaultTopicName, String tagFieldName, String defaultTagName) { this.topicFieldName = topicFieldName; this.defaultTopicName = defaultTopicName; this.tagFieldName = tagFieldName; this.defaultTagName = defaultTagName; } @Override public String getTopic(Map tuple) { if (tuple.containsKey(topicFieldName)) { Object topic = tuple.get(topicFieldName); return topic != null ? topic.toString() : defaultTopicName; } else { LOG.warn("Field {} Not Found. Returning default topic {}", topicFieldName, defaultTopicName); return defaultTopicName; } } @Override public String getTag(Map tuple) { if (tuple.containsKey(tagFieldName)) { Object tag = tuple.get(tagFieldName); return tag != null ? tag.toString() : defaultTagName; } else { LOG.warn("Field {} Not Found. Returning default tag {}", tagFieldName, defaultTagName); return defaultTagName; } } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/common/selector/TopicSelector.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink.common.selector; import java.io.Serializable; public interface TopicSelector<T> extends Serializable { String getTopic(T tuple); String getTag(T tuple); } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/common/serialization/JsonSerializationSchema.java ================================================ package org.apache.rocketmq.flink.common.serialization; import org.apache.flink.api.common.serialization.SerializationSchema; import org.apache.flink.table.data.RowData; import org.apache.rocketmq.common.message.Message; import javax.annotation.Nullable; /** * Serializes a RowData record into a RocketMQ message. * @author ChengLong 2021-5-9 13:40:17 */ public class JsonSerializationSchema implements TagKeyValueSerializationSchema<RowData> { private final String topic; private final @Nullable String tags; private final SerializationSchema<RowData> valueSerialization; private RowData.FieldGetter[] keyFieldGetters; private RowData.FieldGetter[] valueFieldGetters; public JsonSerializationSchema( String topic, @Nullable String tags, SerializationSchema<RowData> valueSerialization) { this.topic = topic; this.tags = tags; this.valueSerialization = valueSerialization; } public JsonSerializationSchema( String topic, @Nullable String tags, SerializationSchema<RowData> valueSerialization, RowData.FieldGetter[] keyFieldGetters, RowData.FieldGetter[] valueFieldGetters) { this.topic = topic; this.tags = tags; this.valueSerialization = valueSerialization; this.keyFieldGetters = keyFieldGetters; this.valueFieldGetters = valueFieldGetters; } @Override public void open(SerializationSchema.InitializationContext context) throws Exception { valueSerialization.open(context); } @Override public Message serialize(RowData consumedRow) { final byte[] valueSerialized = valueSerialization.serialize(consumedRow); return new Message( topic, tags, null, valueSerialized); } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/common/serialization/KeyValueDeserializationSchema.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink.common.serialization; import org.apache.flink.api.java.typeutils.ResultTypeQueryable; import java.io.Serializable; public interface KeyValueDeserializationSchema extends ResultTypeQueryable, Serializable { T deserializeKeyAndValue(byte[] key, byte[] value); } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/common/serialization/KeyValueSerializationSchema.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink.common.serialization; import java.io.Serializable; public interface KeyValueSerializationSchema extends Serializable { byte[] serializeKey(T tuple); byte[] serializeValue(T tuple); } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/common/serialization/SimpleKeyValueDeserializationSchema.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink.common.serialization; import org.apache.flink.api.common.typeinfo.TypeInformation; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; public class SimpleKeyValueDeserializationSchema implements KeyValueDeserializationSchema { public static final String DEFAULT_KEY_FIELD = "key"; public static final String DEFAULT_VALUE_FIELD = "value"; public String keyField; public String valueField; public SimpleKeyValueDeserializationSchema() { this(DEFAULT_KEY_FIELD, DEFAULT_VALUE_FIELD); } /** * SimpleKeyValueDeserializationSchema Constructor. 
* @param keyField tuple field for selecting the key * @param valueField tuple field for selecting the value */ public SimpleKeyValueDeserializationSchema(String keyField, String valueField) { this.keyField = keyField; this.valueField = valueField; } @Override public Map deserializeKeyAndValue(byte[] key, byte[] value) { HashMap map = new HashMap(2); if (keyField != null) { String k = key != null ? new String(key, StandardCharsets.UTF_8) : null; map.put(keyField, k); } if (valueField != null) { String v = value != null ? new String(value, StandardCharsets.UTF_8) : null; map.put(valueField, v); } return map; } @Override public TypeInformation getProducedType() { return TypeInformation.of(Map.class); } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/common/serialization/SimpleKeyValueSerializationSchema.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink.common.serialization; import java.nio.charset.StandardCharsets; import java.util.Map; public class SimpleKeyValueSerializationSchema implements KeyValueSerializationSchema { public static final String DEFAULT_KEY_FIELD = "key"; public static final String DEFAULT_VALUE_FIELD = "value"; public String keyField; public String valueField; public SimpleKeyValueSerializationSchema() { this(DEFAULT_KEY_FIELD, DEFAULT_VALUE_FIELD); } /** * SimpleKeyValueSerializationSchema Constructor. * @param keyField tuple field for selecting the key * @param valueField tuple field for selecting the value */ public SimpleKeyValueSerializationSchema(String keyField, String valueField) { this.keyField = keyField; this.valueField = valueField; } @Override public byte[] serializeKey(Map tuple) { if (tuple == null || keyField == null) { return null; } Object key = tuple.get(keyField); return key != null ? key.toString().getBytes(StandardCharsets.UTF_8) : null; } @Override public byte[] serializeValue(Map tuple) { if (tuple == null || valueField == null) { return null; } Object value = tuple.get(valueField); return value != null ? value.toString().getBytes(StandardCharsets.UTF_8) : null; } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java/org/apache/rocketmq/flink/common/serialization/TagKeyValueSerializationSchema.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink.common.serialization; import org.apache.flink.api.common.serialization.SerializationSchema; import org.apache.rocketmq.common.message.Message; import java.io.Serializable; /** * Serialization schema that carries RocketMQ tag information. * * @author ChengLong 2021-8-17 13:32:21 */ public interface TagKeyValueSerializationSchema<T> extends Serializable { default void open(SerializationSchema.InitializationContext context) throws Exception { } Message serialize(T element); } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java-flink-1.12/org/apache/rocketmq/flink/RocketMQSourceWithTag.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE * file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the * License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License.
*/ package org.apache.rocketmq.flink; import com.esotericsoftware.minlog.Log; import org.apache.commons.collections.map.LinkedMap; import org.apache.commons.lang.Validate; import org.apache.flink.api.common.state.ListState; import org.apache.flink.api.common.state.ListStateDescriptor; import org.apache.flink.api.common.typeinfo.TypeHint; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.api.java.typeutils.ResultTypeQueryable; import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.CheckpointListener; import org.apache.flink.runtime.state.FunctionInitializationContext; import org.apache.flink.runtime.state.FunctionSnapshotContext; import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; import org.apache.rocketmq.client.consumer.DefaultMQPullConsumer; import org.apache.rocketmq.client.consumer.MQPullConsumerScheduleService; import org.apache.rocketmq.client.consumer.PullResult; import org.apache.rocketmq.client.exception.MQBrokerException; import org.apache.rocketmq.client.exception.MQClientException; import org.apache.rocketmq.common.message.MessageExt; import org.apache.rocketmq.common.message.MessageQueue; import org.apache.rocketmq.flink.common.serialization.TagKeyValueDeserializationSchema; import org.apache.rocketmq.remoting.exception.RemotingException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.charset.StandardCharsets; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import static org.apache.rocketmq.flink.RocketMQConfig.*; import static org.apache.rocketmq.flink.RocketMQUtils.getInteger; import static org.apache.rocketmq.flink.RocketMQUtils.getLong; /** * The RocketMQSource is based on RocketMQ pull consumer mode, and provides exactly once reliability guarantees when * checkpoints are enabled. Otherwise, the source doesn't provide any reliability guarantees. */ public class RocketMQSourceWithTag extends RichParallelSourceFunction implements CheckpointedFunction, CheckpointListener, ResultTypeQueryable { private static final long serialVersionUID = 1L; private static final Logger LOG = LoggerFactory.getLogger(RocketMQSourceWithTag.class); private transient MQPullConsumerScheduleService pullConsumerScheduleService; private DefaultMQPullConsumer consumer; private TagKeyValueDeserializationSchema schema; private RunningChecker runningChecker; private transient ListState> unionOffsetStates; private Map offsetTable; private Map restoredOffsets; /** Data for pending but uncommitted offsets. 
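These offsets are committed to the broker in notifyCheckpointComplete(), i.e. only after the corresponding checkpoint has succeeded.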
*/ private LinkedMap pendingOffsetsToCommit; private Properties props; private String topic; private String group; private static final String OFFSETS_STATE_NAME = "topic-partition-offset-states-with-tags"; private transient volatile boolean restored; private transient boolean enableCheckpoint; public RocketMQSourceWithTag(TagKeyValueDeserializationSchema schema, Properties props) { this.schema = schema; this.props = props; } @Override public void open(Configuration parameters) throws Exception { LOG.debug("source open...."); Validate.notEmpty(props, "Consumer properties can not be empty"); Validate.notNull(schema, "TagKeyValueDeserializationSchema can not be null"); this.topic = props.getProperty(RocketMQConfig.CONSUMER_TOPIC); this.group = props.getProperty(RocketMQConfig.CONSUMER_GROUP); Validate.notEmpty(topic, "Consumer topic can not be empty"); Validate.notEmpty(group, "Consumer group can not be empty"); this.enableCheckpoint = ((StreamingRuntimeContext) getRuntimeContext()).isCheckpointingEnabled(); if (offsetTable == null) { offsetTable = new ConcurrentHashMap<>(); } if (restoredOffsets == null) { restoredOffsets = new ConcurrentHashMap<>(); } if (pendingOffsetsToCommit == null) { pendingOffsetsToCommit = new LinkedMap(); } runningChecker = new RunningChecker(); //Wait for lite pull consumer pullConsumerScheduleService = new MQPullConsumerScheduleService(group, RocketMQConfig.buildAclRPCHook(props)); consumer = pullConsumerScheduleService.getDefaultMQPullConsumer(); consumer.setInstanceName(getRuntimeContext().getIndexOfThisSubtask() + "_" + UUID.randomUUID()); RocketMQConfig.buildConsumerConfigs(props, consumer); } @Override public void run(SourceContext context) throws Exception { LOG.debug("source run...."); // The lock that guarantees that record emission and state updates are atomic, // from the view of taking a checkpoint. final Object lock = context.getCheckpointLock(); int delayWhenMessageNotFound = getInteger(props, RocketMQConfig.CONSUMER_DELAY_WHEN_MESSAGE_NOT_FOUND, RocketMQConfig.DEFAULT_CONSUMER_DELAY_WHEN_MESSAGE_NOT_FOUND); String tag = props.getProperty(RocketMQConfig.CONSUMER_TAG, RocketMQConfig.DEFAULT_CONSUMER_TAG); int pullPoolSize = getInteger(props, RocketMQConfig.CONSUMER_PULL_POOL_SIZE, RocketMQConfig.DEFAULT_CONSUMER_PULL_POOL_SIZE); int pullBatchSize = getInteger(props, RocketMQConfig.CONSUMER_BATCH_SIZE, RocketMQConfig.DEFAULT_CONSUMER_BATCH_SIZE); pullConsumerScheduleService.setPullThreadNums(pullPoolSize); pullConsumerScheduleService.registerPullTaskCallback(topic, (mq, pullTaskContext) -> { try { long offset = getMessageQueueOffset(mq); if (offset < 0) { return; } Log.debug("Current pullBatchSize is: " + pullBatchSize); PullResult pullResult = consumer.pull(mq, tag, offset, pullBatchSize); boolean found = false; switch (pullResult.getPullStatus()) { case FOUND: List messages = pullResult.getMsgFoundList(); if (pullBatchSize != messages.size()) LOG.debug("Pull from rocketmq records is: {}", messages.size()); for (MessageExt msg : messages) { byte[] tag1 = msg.getTags() != null ? msg.getTags().getBytes(StandardCharsets.UTF_8) : null; byte[] key = msg.getKeys() != null ? 
msg.getKeys().getBytes(StandardCharsets.UTF_8) : null; byte[] value = msg.getBody(); OUT data = schema.deserializeTagKeyAndValue(tag1, key, value); // output and state update are atomic synchronized (lock) { context.collectWithTimestamp(data, msg.getBornTimestamp()); } } found = true; break; case NO_MATCHED_MSG: LOG.debug("No matched message after offset {} for queue {}", offset, mq); break; case NO_NEW_MSG: break; case OFFSET_ILLEGAL: LOG.warn("Offset {} is illegal for queue {}", offset, mq); break; default: break; } synchronized (lock) { putMessageQueueOffset(mq, pullResult.getNextBeginOffset()); } if (found) { pullTaskContext.setPullNextDelayTimeMillis(0); // no delay when messages were found } else { pullTaskContext.setPullNextDelayTimeMillis(delayWhenMessageNotFound); } } catch (Exception e) { throw new RuntimeException(e); } }); try { pullConsumerScheduleService.start(); } catch (MQClientException e) { throw new RuntimeException(e); } runningChecker.setRunning(true); awaitTermination(); } private void awaitTermination() throws InterruptedException { while (runningChecker.isRunning()) { Thread.sleep(50); } } private long getMessageQueueOffset(MessageQueue mq) throws MQClientException { Long offset = offsetTable.get(mq); // restoredOffsets(unionOffsetStates) is the restored global union state; // should only snapshot mqs that actually belong to us if (restored && offset == null) { offset = restoredOffsets.get(mq); } if (offset == null) { LOG.debug("从状态中获取Offset列表为空,将从server端获取offset列表"); offset = consumer.fetchConsumeOffset(mq, true); if (offset < 0) { String initialOffset = props.getProperty(RocketMQConfig.CONSUMER_OFFSET_RESET_TO, CONSUMER_OFFSET_LATEST); switch (initialOffset) { case CONSUMER_OFFSET_EARLIEST: offset = consumer.minOffset(mq); break; case CONSUMER_OFFSET_LATEST: offset = consumer.maxOffset(mq); break; case CONSUMER_OFFSET_TIMESTAMP: offset = consumer.searchOffset(mq, getLong(props, RocketMQConfig.CONSUMER_OFFSET_FROM_TIMESTAMP, System.currentTimeMillis())); break; default: throw new IllegalArgumentException("Unknown value for CONSUMER_OFFSET_RESET_TO."); } } } offsetTable.put(mq, offset); return offsetTable.get(mq); } private void putMessageQueueOffset(MessageQueue mq, long offset) throws MQClientException, RemotingException, InterruptedException, MQBrokerException { offsetTable.put(mq, offset); if (!enableCheckpoint) { consumer.updateConsumeOffset(mq, offset); // consumer.getOffsetStore().updateConsumeOffsetToBroker(mq,offset,true); } } @Override public void cancel() { LOG.debug("cancel ..."); runningChecker.setRunning(false); if (pullConsumerScheduleService != null) { pullConsumerScheduleService.shutdown(); } if (offsetTable != null) { offsetTable.clear(); } if (restoredOffsets != null) { restoredOffsets.clear(); } if (pendingOffsetsToCommit != null) { pendingOffsetsToCommit.clear(); } } @Override public void close() throws Exception { LOG.debug("close ..."); // pretty much the same logic as cancelling try { cancel(); } finally { super.close(); } } @Override public void snapshotState(FunctionSnapshotContext context) throws Exception { // called when a snapshot for a checkpoint is requested if (!runningChecker.isRunning()) { LOG.debug("snapshotState() called on closed source; returning null."); return; } if (LOG.isDebugEnabled()) { LOG.debug("Snapshotting state {} ...", context.getCheckpointId()); } unionOffsetStates.clear(); HashMap currentOffsets = new HashMap<>(offsetTable.size()); // remove the unassigned queues in order to avoid read the wrong offset when the 
source restart Set assignedQueues = consumer.fetchMessageQueuesInBalance(topic); offsetTable.entrySet().removeIf(item -> !assignedQueues.contains(item.getKey())); for (Map.Entry entry : offsetTable.entrySet()) { unionOffsetStates.add(Tuple2.of(entry.getKey(), entry.getValue())); currentOffsets.put(entry.getKey(), entry.getValue()); } pendingOffsetsToCommit.put(context.getCheckpointId(), currentOffsets); if (LOG.isDebugEnabled()) { LOG.debug("Snapshotted state, last processed offsets: {}, checkpoint id: {}, timestamp: {}", offsetTable, context.getCheckpointId(), context.getCheckpointTimestamp()); } } @Override public void initializeState(FunctionInitializationContext context) throws Exception { // called every time the user-defined function is initialized, // be that when the function is first initialized or be that // when the function is actually recovering from an earlier checkpoint. // Given this, initializeState() is not only the place where different types of state are initialized, // but also where state recovery logic is included. LOG.debug("initialize State ..."); this.unionOffsetStates = context.getOperatorStateStore().getUnionListState(new ListStateDescriptor<>( OFFSETS_STATE_NAME, TypeInformation.of(new TypeHint>() {}))); this.restored = context.isRestored(); if (restored) { if (restoredOffsets == null) { restoredOffsets = new ConcurrentHashMap<>(); } for (Tuple2 mqOffsets : unionOffsetStates.get()) { if (!restoredOffsets.containsKey(mqOffsets.f0) || restoredOffsets.get(mqOffsets.f0) < mqOffsets.f1) { restoredOffsets.put(mqOffsets.f0, mqOffsets.f1); } } LOG.info("Setting restore state in the consumer. Using the following offsets: {}", restoredOffsets); } else { LOG.info("No restore state for the consumer."); } } @Override public TypeInformation getProducedType() { return schema.getProducedType(); } @Override public void notifyCheckpointComplete(long checkpointId) throws Exception { // callback when checkpoint complete if (!runningChecker.isRunning()) { LOG.debug("notifyCheckpointComplete() called on closed source; returning null."); return; } final int posInMap = pendingOffsetsToCommit.indexOf(checkpointId); if (posInMap == -1) { LOG.warn("Received confirmation for unknown checkpoint id {}", checkpointId); return; } Map offsets = (Map) pendingOffsetsToCommit.remove(posInMap); // remove older checkpoints in map for (int i = 0; i < posInMap; i++) { pendingOffsetsToCommit.remove(0); } if (offsets == null || offsets.size() == 0) { LOG.debug("Checkpoint state was empty."); return; } for (Map.Entry entry : offsets.entrySet()) { consumer.updateConsumeOffset(entry.getKey(), entry.getValue()); } } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java-flink-1.12/org/apache/rocketmq/flink/common/serialization/JsonDeserializationSchema.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink.common.serialization; import org.apache.flink.api.common.serialization.DeserializationSchema; import org.apache.flink.api.common.typeinfo.TypeHint; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.table.data.RowData; import java.io.IOException; /** * Deserializes a RocketMQ message into a RowData record. * @author ChengLong 2021-5-9 13:40:17 */ public class JsonDeserializationSchema implements TagKeyValueDeserializationSchema<RowData> { private DeserializationSchema<RowData> key; private DeserializationSchema<RowData> value; public JsonDeserializationSchema(DeserializationSchema<RowData> key, DeserializationSchema<RowData> value) { this.key = key; this.value = value; } @Override public RowData deserializeTagKeyAndValue(byte[] tag, byte[] key, byte[] value) { /*String keyString = key != null ? new String(key, StandardCharsets.UTF_8) : null; String valueString = value != null ? new String(value, StandardCharsets.UTF_8) : null;*/ if (value != null) { try { // delegate deserialization to the SQL connector's format return this.value.deserialize(value); } catch (IOException e) { e.printStackTrace(); } } return null; } @Override public TypeInformation<RowData> getProducedType() { return TypeInformation.of(new TypeHint<RowData>(){}); } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java-flink-1.12/org/apache/rocketmq/flink/common/serialization/SimpleTagKeyValueDeserializationSchema.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink.common.serialization; import org.apache.flink.api.common.typeinfo.TypeHint; import org.apache.flink.api.common.typeinfo.TypeInformation; import scala.Tuple3; import java.nio.charset.StandardCharsets; /** * Deserializes a MessageExt, returning the tag, key and value as a Tuple3. * * @author ChengLong 2021-5-10 09:44:55 */ public class SimpleTagKeyValueDeserializationSchema implements TagKeyValueDeserializationSchema<Tuple3<String, String, String>> { @Override public Tuple3<String, String, String> deserializeTagKeyAndValue(byte[] tag, byte[] key, byte[] value) { String tagString = tag != null ? new String(tag, StandardCharsets.UTF_8) : null; String keyString = key != null ? new String(key, StandardCharsets.UTF_8) : null; String valueString = value != null ? new String(value, StandardCharsets.UTF_8) : null; return new Tuple3<>(tagString, keyString, valueString); } @Override public TypeInformation<Tuple3<String, String, String>> getProducedType() { return TypeInformation.of(new TypeHint<Tuple3<String, String, String>>(){}); } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java-flink-1.12/org/apache/rocketmq/flink/common/serialization/TagKeyValueDeserializationSchema.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink.common.serialization; import org.apache.flink.api.java.typeutils.ResultTypeQueryable; import java.io.Serializable; /** * Deserialization schema that carries RocketMQ tag information. * @author ChengLong 2021-5-10 09:43:35 */ public interface TagKeyValueDeserializationSchema<T> extends ResultTypeQueryable<T>, Serializable { T deserializeTagKeyAndValue(byte[] tag, byte[] key, byte[] value); } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java-flink-1.13/org/apache/rocketmq/flink/RocketMQSourceWithTag.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE * file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the * License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License.
*/ package org.apache.rocketmq.flink; import com.esotericsoftware.minlog.Log; import org.apache.commons.collections.map.LinkedMap; import org.apache.commons.lang.Validate; import org.apache.flink.api.common.state.ListState; import org.apache.flink.api.common.state.ListStateDescriptor; import org.apache.flink.api.common.typeinfo.TypeHint; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.api.java.typeutils.ResultTypeQueryable; import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.CheckpointListener; import org.apache.flink.runtime.state.FunctionInitializationContext; import org.apache.flink.runtime.state.FunctionSnapshotContext; import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; import org.apache.rocketmq.client.consumer.DefaultMQPullConsumer; import org.apache.rocketmq.client.consumer.MQPullConsumerScheduleService; import org.apache.rocketmq.client.consumer.PullResult; import org.apache.rocketmq.client.exception.MQBrokerException; import org.apache.rocketmq.client.exception.MQClientException; import org.apache.rocketmq.common.message.MessageExt; import org.apache.rocketmq.common.message.MessageQueue; import org.apache.rocketmq.flink.common.serialization.TagKeyValueDeserializationSchema; import org.apache.rocketmq.remoting.exception.RemotingException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.charset.StandardCharsets; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import static org.apache.rocketmq.flink.RocketMQConfig.*; import static org.apache.rocketmq.flink.RocketMQUtils.getInteger; import static org.apache.rocketmq.flink.RocketMQUtils.getLong; /** * The RocketMQSource is based on RocketMQ pull consumer mode, and provides exactly once reliability guarantees when * checkpoints are enabled. Otherwise, the source doesn't provide any reliability guarantees. */ public class RocketMQSourceWithTag extends RichParallelSourceFunction implements CheckpointedFunction, CheckpointListener, ResultTypeQueryable { private static final long serialVersionUID = 1L; private static final Logger LOG = LoggerFactory.getLogger(RocketMQSourceWithTag.class); private transient MQPullConsumerScheduleService pullConsumerScheduleService; private DefaultMQPullConsumer consumer; private TagKeyValueDeserializationSchema schema; private RunningChecker runningChecker; private transient ListState> unionOffsetStates; private Map offsetTable; private Map restoredOffsets; /** Data for pending but uncommitted offsets. 
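As in the 1.12 variant, entries added here in snapshotState() are committed to the broker only when notifyCheckpointComplete() fires for that checkpoint.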
*/ private LinkedMap pendingOffsetsToCommit; private Properties props; private String topic; private String group; private static final String OFFSETS_STATE_NAME = "topic-partition-offset-states-with-tags"; private transient volatile boolean restored; private transient boolean enableCheckpoint; public RocketMQSourceWithTag(TagKeyValueDeserializationSchema schema, Properties props) { this.schema = schema; this.props = props; } @Override public void open(Configuration parameters) throws Exception { LOG.debug("source open...."); Validate.notEmpty(props, "Consumer properties can not be empty"); Validate.notNull(schema, "TagKeyValueDeserializationSchema can not be null"); this.topic = props.getProperty(RocketMQConfig.CONSUMER_TOPIC); this.group = props.getProperty(RocketMQConfig.CONSUMER_GROUP); Validate.notEmpty(topic, "Consumer topic can not be empty"); Validate.notEmpty(group, "Consumer group can not be empty"); this.enableCheckpoint = ((StreamingRuntimeContext) getRuntimeContext()).isCheckpointingEnabled(); if (offsetTable == null) { offsetTable = new ConcurrentHashMap<>(); } if (restoredOffsets == null) { restoredOffsets = new ConcurrentHashMap<>(); } if (pendingOffsetsToCommit == null) { pendingOffsetsToCommit = new LinkedMap(); } runningChecker = new RunningChecker(); //Wait for lite pull consumer pullConsumerScheduleService = new MQPullConsumerScheduleService(group, RocketMQConfig.buildAclRPCHook(props)); consumer = pullConsumerScheduleService.getDefaultMQPullConsumer(); consumer.setInstanceName(getRuntimeContext().getIndexOfThisSubtask() + "_" + UUID.randomUUID()); RocketMQConfig.buildConsumerConfigs(props, consumer); } @Override public void run(SourceContext context) throws Exception { LOG.debug("source run...."); // The lock that guarantees that record emission and state updates are atomic, // from the view of taking a checkpoint. final Object lock = context.getCheckpointLock(); int delayWhenMessageNotFound = getInteger(props, RocketMQConfig.CONSUMER_DELAY_WHEN_MESSAGE_NOT_FOUND, RocketMQConfig.DEFAULT_CONSUMER_DELAY_WHEN_MESSAGE_NOT_FOUND); String tag = props.getProperty(RocketMQConfig.CONSUMER_TAG, RocketMQConfig.DEFAULT_CONSUMER_TAG); int pullPoolSize = getInteger(props, RocketMQConfig.CONSUMER_PULL_POOL_SIZE, RocketMQConfig.DEFAULT_CONSUMER_PULL_POOL_SIZE); int pullBatchSize = getInteger(props, RocketMQConfig.CONSUMER_BATCH_SIZE, RocketMQConfig.DEFAULT_CONSUMER_BATCH_SIZE); pullConsumerScheduleService.setPullThreadNums(pullPoolSize); pullConsumerScheduleService.registerPullTaskCallback(topic, (mq, pullTaskContext) -> { try { long offset = getMessageQueueOffset(mq); if (offset < 0) { return; } Log.debug("Current pullBatchSize is: " + pullBatchSize); PullResult pullResult = consumer.pull(mq, tag, offset, pullBatchSize); boolean found = false; switch (pullResult.getPullStatus()) { case FOUND: List messages = pullResult.getMsgFoundList(); if (pullBatchSize != messages.size()) LOG.debug("Pull from rocketmq records is: {}", messages.size()); for (MessageExt msg : messages) { byte[] tag1 = msg.getTags() != null ? msg.getTags().getBytes(StandardCharsets.UTF_8) : null; byte[] key = msg.getKeys() != null ? 
msg.getKeys().getBytes(StandardCharsets.UTF_8) : null; byte[] value = msg.getBody(); OUT data = schema.deserializeTagKeyAndValue(tag1, key, value); // output and state update are atomic synchronized (lock) { context.collectWithTimestamp(data, msg.getBornTimestamp()); } } found = true; break; case NO_MATCHED_MSG: LOG.debug("No matched message after offset {} for queue {}", offset, mq); break; case NO_NEW_MSG: break; case OFFSET_ILLEGAL: LOG.warn("Offset {} is illegal for queue {}", offset, mq); break; default: break; } synchronized (lock) { putMessageQueueOffset(mq, pullResult.getNextBeginOffset()); } if (found) { pullTaskContext.setPullNextDelayTimeMillis(0); // no delay when messages were found } else { pullTaskContext.setPullNextDelayTimeMillis(delayWhenMessageNotFound); } } catch (Exception e) { throw new RuntimeException(e); } }); try { pullConsumerScheduleService.start(); } catch (MQClientException e) { throw new RuntimeException(e); } runningChecker.setRunning(true); awaitTermination(); } private void awaitTermination() throws InterruptedException { while (runningChecker.isRunning()) { Thread.sleep(50); } } private long getMessageQueueOffset(MessageQueue mq) throws MQClientException { Long offset = offsetTable.get(mq); // restoredOffsets(unionOffsetStates) is the restored global union state; // should only snapshot mqs that actually belong to us if (restored && offset == null) { offset = restoredOffsets.get(mq); } if (offset == null) { LOG.debug("从状态中获取Offset列表为空,将从server端获取offset列表"); offset = consumer.fetchConsumeOffset(mq, true); if (offset < 0) { String initialOffset = props.getProperty(RocketMQConfig.CONSUMER_OFFSET_RESET_TO, CONSUMER_OFFSET_LATEST); switch (initialOffset) { case CONSUMER_OFFSET_EARLIEST: offset = consumer.minOffset(mq); break; case CONSUMER_OFFSET_LATEST: offset = consumer.maxOffset(mq); break; case CONSUMER_OFFSET_TIMESTAMP: offset = consumer.searchOffset(mq, getLong(props, RocketMQConfig.CONSUMER_OFFSET_FROM_TIMESTAMP, System.currentTimeMillis())); break; default: throw new IllegalArgumentException("Unknown value for CONSUMER_OFFSET_RESET_TO."); } } } offsetTable.put(mq, offset); return offsetTable.get(mq); } private void putMessageQueueOffset(MessageQueue mq, long offset) throws MQClientException, RemotingException, InterruptedException, MQBrokerException { offsetTable.put(mq, offset); if (!enableCheckpoint) { consumer.updateConsumeOffset(mq, offset); // consumer.getOffsetStore().updateConsumeOffsetToBroker(mq,offset,true); } } @Override public void cancel() { LOG.debug("cancel ..."); runningChecker.setRunning(false); if (pullConsumerScheduleService != null) { pullConsumerScheduleService.shutdown(); } if (offsetTable != null) { offsetTable.clear(); } if (restoredOffsets != null) { restoredOffsets.clear(); } if (pendingOffsetsToCommit != null) { pendingOffsetsToCommit.clear(); } } @Override public void close() throws Exception { LOG.debug("close ..."); // pretty much the same logic as cancelling try { cancel(); } finally { super.close(); } } @Override public void snapshotState(FunctionSnapshotContext context) throws Exception { // called when a snapshot for a checkpoint is requested if (!runningChecker.isRunning()) { LOG.debug("snapshotState() called on closed source; returning null."); return; } if (LOG.isDebugEnabled()) { LOG.debug("Snapshotting state {} ...", context.getCheckpointId()); } unionOffsetStates.clear(); HashMap currentOffsets = new HashMap<>(offsetTable.size()); // remove the unassigned queues in order to avoid read the wrong offset when the 
source restart Set assignedQueues = consumer.fetchMessageQueuesInBalance(topic); offsetTable.entrySet().removeIf(item -> !assignedQueues.contains(item.getKey())); for (Map.Entry entry : offsetTable.entrySet()) { unionOffsetStates.add(Tuple2.of(entry.getKey(), entry.getValue())); currentOffsets.put(entry.getKey(), entry.getValue()); } pendingOffsetsToCommit.put(context.getCheckpointId(), currentOffsets); if (LOG.isDebugEnabled()) { LOG.debug("Snapshotted state, last processed offsets: {}, checkpoint id: {}, timestamp: {}", offsetTable, context.getCheckpointId(), context.getCheckpointTimestamp()); } } @Override public void initializeState(FunctionInitializationContext context) throws Exception { // called every time the user-defined function is initialized, // be that when the function is first initialized or be that // when the function is actually recovering from an earlier checkpoint. // Given this, initializeState() is not only the place where different types of state are initialized, // but also where state recovery logic is included. LOG.debug("initialize State ..."); this.unionOffsetStates = context.getOperatorStateStore().getUnionListState(new ListStateDescriptor<>( OFFSETS_STATE_NAME, TypeInformation.of(new TypeHint>() {}))); this.restored = context.isRestored(); if (restored) { if (restoredOffsets == null) { restoredOffsets = new ConcurrentHashMap<>(); } for (Tuple2 mqOffsets : unionOffsetStates.get()) { if (!restoredOffsets.containsKey(mqOffsets.f0) || restoredOffsets.get(mqOffsets.f0) < mqOffsets.f1) { restoredOffsets.put(mqOffsets.f0, mqOffsets.f1); } } LOG.info("Setting restore state in the consumer. Using the following offsets: {}", restoredOffsets); } else { LOG.info("No restore state for the consumer."); } } @Override public TypeInformation getProducedType() { return schema.getProducedType(); } @Override public void notifyCheckpointComplete(long checkpointId) throws Exception { // callback when checkpoint complete if (!runningChecker.isRunning()) { LOG.debug("notifyCheckpointComplete() called on closed source; returning null."); return; } final int posInMap = pendingOffsetsToCommit.indexOf(checkpointId); if (posInMap == -1) { LOG.warn("Received confirmation for unknown checkpoint id {}", checkpointId); return; } Map offsets = (Map) pendingOffsetsToCommit.remove(posInMap); // remove older checkpoints in map for (int i = 0; i < posInMap; i++) { pendingOffsetsToCommit.remove(0); } if (offsets == null || offsets.size() == 0) { LOG.debug("Checkpoint state was empty."); return; } for (Map.Entry entry : offsets.entrySet()) { consumer.updateConsumeOffset(entry.getKey(), entry.getValue()); } } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java-flink-1.13/org/apache/rocketmq/flink/common/serialization/JsonDeserializationSchema.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink.common.serialization; import org.apache.flink.api.common.serialization.DeserializationSchema; import org.apache.flink.api.common.typeinfo.TypeHint; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.table.data.RowData; import java.io.IOException; /** * 将rocketmq消息反序列化成RowData * @author ChengLong 2021-5-9 13:40:17 */ public class JsonDeserializationSchema implements TagKeyValueDeserializationSchema { private DeserializationSchema key; private DeserializationSchema value; public JsonDeserializationSchema(DeserializationSchema key, DeserializationSchema value) { this.key = key; this.value = value; } @Override public RowData deserializeTagKeyAndValue(byte[] tag, byte[] key, byte[] value) { /*String keyString = key != null ? new String(key, StandardCharsets.UTF_8) : null; String valueString = value != null ? new String(value, StandardCharsets.UTF_8) : null;*/ if (value != null) { try { // 调用sql connector的format进行反序列化 return this.value.deserialize(value); } catch (IOException e) { e.printStackTrace(); } } return null; } @Override public TypeInformation getProducedType() { return TypeInformation.of(new TypeHint(){}); } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java-flink-1.13/org/apache/rocketmq/flink/common/serialization/SimpleTagKeyValueDeserializationSchema.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink.common.serialization; import org.apache.flink.api.common.typeinfo.TypeHint; import org.apache.flink.api.common.typeinfo.TypeInformation; import scala.Tuple3; import java.nio.charset.StandardCharsets; /** * 反序列化MessageExt,将tag、key、value以tuple3方式返回 * * @author ChengLong 2021-5-10 09:44:55 */ public class SimpleTagKeyValueDeserializationSchema implements TagKeyValueDeserializationSchema> { @Override public Tuple3 deserializeTagKeyAndValue(byte[] tag, byte[] key, byte[] value) { String tagString = tag != null ? new String(tag, StandardCharsets.UTF_8) : null; String keyString = key != null ? new String(key, StandardCharsets.UTF_8) : null; String valueString = value != null ? 
                new String(value, StandardCharsets.UTF_8) : null;
        return new Tuple3<>(tagString, keyString, valueString);
    }

    @Override
    public TypeInformation<Tuple3<String, String, String>> getProducedType() {
        return TypeInformation.of(new TypeHint<Tuple3<String, String, String>>(){});
    }
}

================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java-flink-1.13/org/apache/rocketmq/flink/common/serialization/TagKeyValueDeserializationSchema.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.rocketmq.flink.common.serialization;

import org.apache.flink.api.java.typeutils.ResultTypeQueryable;

import java.io.Serializable;

/**
 * Deserialization schema that carries the message tag information.
 * @author ChengLong 2021-5-10 09:43:35
 */
public interface TagKeyValueDeserializationSchema<T> extends ResultTypeQueryable<T>, Serializable {
    T deserializeTagKeyAndValue(byte[] tag, byte[] key, byte[] value);
}

================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java-flink-1.14/org/apache/rocketmq/flink/RocketMQSourceWithTag.java ================================================
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE
 * file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
 * License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package org.apache.rocketmq.flink;

import com.esotericsoftware.minlog.Log;
import org.apache.commons.collections.map.LinkedMap;
import org.apache.commons.lang.Validate;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.state.CheckpointListener;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
import org.apache.flink.streaming.api.operators.StreamingRuntimeContext;
import org.apache.rocketmq.client.consumer.DefaultMQPullConsumer;
import org.apache.rocketmq.client.consumer.MQPullConsumerScheduleService;
import org.apache.rocketmq.client.consumer.PullResult;
import org.apache.rocketmq.client.exception.MQBrokerException;
import org.apache.rocketmq.client.exception.MQClientException;
import org.apache.rocketmq.common.message.MessageExt;
import org.apache.rocketmq.common.message.MessageQueue;
import org.apache.rocketmq.flink.common.serialization.TagKeyValueDeserializationSchema;
import org.apache.rocketmq.remoting.exception.RemotingException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;

import static org.apache.rocketmq.flink.RocketMQConfig.*;
import static org.apache.rocketmq.flink.RocketMQUtils.getInteger;
import static org.apache.rocketmq.flink.RocketMQUtils.getLong;

/**
 * The RocketMQSource is based on RocketMQ pull consumer mode, and provides exactly once reliability guarantees when
 * checkpoints are enabled. Otherwise, the source doesn't provide any reliability guarantees.
 */
public class RocketMQSourceWithTag<OUT> extends RichParallelSourceFunction<OUT>
        implements CheckpointedFunction, CheckpointListener, ResultTypeQueryable<OUT> {

    private static final long serialVersionUID = 1L;

    private static final Logger LOG = LoggerFactory.getLogger(RocketMQSourceWithTag.class);

    private transient MQPullConsumerScheduleService pullConsumerScheduleService;
    private DefaultMQPullConsumer consumer;

    private TagKeyValueDeserializationSchema<OUT> schema;

    private RunningChecker runningChecker;

    private transient ListState<Tuple2<MessageQueue, Long>> unionOffsetStates;
    private Map<MessageQueue, Long> offsetTable;
    private Map<MessageQueue, Long> restoredOffsets;

    /** Data for pending but uncommitted offsets.
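     * Keyed by checkpoint id; the offsets captured for a checkpoint are committed back to the broker in notifyCheckpointComplete().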
*/ private LinkedMap pendingOffsetsToCommit; private Properties props; private String topic; private String group; private static final String OFFSETS_STATE_NAME = "topic-partition-offset-states-with-tags"; private transient volatile boolean restored; private transient boolean enableCheckpoint; public RocketMQSourceWithTag(TagKeyValueDeserializationSchema schema, Properties props) { this.schema = schema; this.props = props; } @Override public void open(Configuration parameters) throws Exception { LOG.debug("source open...."); Validate.notEmpty(props, "Consumer properties can not be empty"); Validate.notNull(schema, "TagKeyValueDeserializationSchema can not be null"); this.topic = props.getProperty(RocketMQConfig.CONSUMER_TOPIC); this.group = props.getProperty(RocketMQConfig.CONSUMER_GROUP); Validate.notEmpty(topic, "Consumer topic can not be empty"); Validate.notEmpty(group, "Consumer group can not be empty"); this.enableCheckpoint = ((StreamingRuntimeContext) getRuntimeContext()).isCheckpointingEnabled(); if (offsetTable == null) { offsetTable = new ConcurrentHashMap<>(); } if (restoredOffsets == null) { restoredOffsets = new ConcurrentHashMap<>(); } if (pendingOffsetsToCommit == null) { pendingOffsetsToCommit = new LinkedMap(); } runningChecker = new RunningChecker(); //Wait for lite pull consumer pullConsumerScheduleService = new MQPullConsumerScheduleService(group, RocketMQConfig.buildAclRPCHook(props)); consumer = pullConsumerScheduleService.getDefaultMQPullConsumer(); consumer.setInstanceName(getRuntimeContext().getIndexOfThisSubtask() + "_" + UUID.randomUUID()); RocketMQConfig.buildConsumerConfigs(props, consumer); } @Override public void run(SourceContext context) throws Exception { LOG.debug("source run...."); // The lock that guarantees that record emission and state updates are atomic, // from the view of taking a checkpoint. final Object lock = context.getCheckpointLock(); int delayWhenMessageNotFound = getInteger(props, RocketMQConfig.CONSUMER_DELAY_WHEN_MESSAGE_NOT_FOUND, RocketMQConfig.DEFAULT_CONSUMER_DELAY_WHEN_MESSAGE_NOT_FOUND); String tag = props.getProperty(RocketMQConfig.CONSUMER_TAG, RocketMQConfig.DEFAULT_CONSUMER_TAG); int pullPoolSize = getInteger(props, RocketMQConfig.CONSUMER_PULL_POOL_SIZE, RocketMQConfig.DEFAULT_CONSUMER_PULL_POOL_SIZE); int pullBatchSize = getInteger(props, RocketMQConfig.CONSUMER_BATCH_SIZE, RocketMQConfig.DEFAULT_CONSUMER_BATCH_SIZE); pullConsumerScheduleService.setPullThreadNums(pullPoolSize); pullConsumerScheduleService.registerPullTaskCallback(topic, (mq, pullTaskContext) -> { try { long offset = getMessageQueueOffset(mq); if (offset < 0) { return; } Log.debug("Current pullBatchSize is: " + pullBatchSize); PullResult pullResult = consumer.pull(mq, tag, offset, pullBatchSize); boolean found = false; switch (pullResult.getPullStatus()) { case FOUND: List messages = pullResult.getMsgFoundList(); if (pullBatchSize != messages.size()) LOG.debug("Pull from rocketmq records is: {}", messages.size()); for (MessageExt msg : messages) { byte[] tag1 = msg.getTags() != null ? msg.getTags().getBytes(StandardCharsets.UTF_8) : null; byte[] key = msg.getKeys() != null ? 
msg.getKeys().getBytes(StandardCharsets.UTF_8) : null; byte[] value = msg.getBody(); OUT data = schema.deserializeTagKeyAndValue(tag1, key, value); // output and state update are atomic synchronized (lock) { context.collectWithTimestamp(data, msg.getBornTimestamp()); } } found = true; break; case NO_MATCHED_MSG: LOG.debug("No matched message after offset {} for queue {}", offset, mq); break; case NO_NEW_MSG: break; case OFFSET_ILLEGAL: LOG.warn("Offset {} is illegal for queue {}", offset, mq); break; default: break; } synchronized (lock) { putMessageQueueOffset(mq, pullResult.getNextBeginOffset()); } if (found) { pullTaskContext.setPullNextDelayTimeMillis(0); // no delay when messages were found } else { pullTaskContext.setPullNextDelayTimeMillis(delayWhenMessageNotFound); } } catch (Exception e) { throw new RuntimeException(e); } }); try { pullConsumerScheduleService.start(); } catch (MQClientException e) { throw new RuntimeException(e); } runningChecker.setRunning(true); awaitTermination(); } private void awaitTermination() throws InterruptedException { while (runningChecker.isRunning()) { Thread.sleep(50); } } private long getMessageQueueOffset(MessageQueue mq) throws MQClientException { Long offset = offsetTable.get(mq); // restoredOffsets(unionOffsetStates) is the restored global union state; // should only snapshot mqs that actually belong to us if (restored && offset == null) { offset = restoredOffsets.get(mq); } if (offset == null) { LOG.debug("从状态中获取Offset列表为空,将从server端获取offset列表"); offset = consumer.fetchConsumeOffset(mq, true); if (offset < 0) { String initialOffset = props.getProperty(RocketMQConfig.CONSUMER_OFFSET_RESET_TO, CONSUMER_OFFSET_LATEST); switch (initialOffset) { case CONSUMER_OFFSET_EARLIEST: offset = consumer.minOffset(mq); break; case CONSUMER_OFFSET_LATEST: offset = consumer.maxOffset(mq); break; case CONSUMER_OFFSET_TIMESTAMP: offset = consumer.searchOffset(mq, getLong(props, RocketMQConfig.CONSUMER_OFFSET_FROM_TIMESTAMP, System.currentTimeMillis())); break; default: throw new IllegalArgumentException("Unknown value for CONSUMER_OFFSET_RESET_TO."); } } } offsetTable.put(mq, offset); return offsetTable.get(mq); } private void putMessageQueueOffset(MessageQueue mq, long offset) throws MQClientException, RemotingException, InterruptedException, MQBrokerException { offsetTable.put(mq, offset); if (!enableCheckpoint) { consumer.updateConsumeOffset(mq, offset); // consumer.getOffsetStore().updateConsumeOffsetToBroker(mq,offset,true); } } @Override public void cancel() { LOG.debug("cancel ..."); runningChecker.setRunning(false); if (pullConsumerScheduleService != null) { pullConsumerScheduleService.shutdown(); } if (offsetTable != null) { offsetTable.clear(); } if (restoredOffsets != null) { restoredOffsets.clear(); } if (pendingOffsetsToCommit != null) { pendingOffsetsToCommit.clear(); } } @Override public void close() throws Exception { LOG.debug("close ..."); // pretty much the same logic as cancelling try { cancel(); } finally { super.close(); } } @Override public void snapshotState(FunctionSnapshotContext context) throws Exception { // called when a snapshot for a checkpoint is requested if (!runningChecker.isRunning()) { LOG.debug("snapshotState() called on closed source; returning null."); return; } if (LOG.isDebugEnabled()) { LOG.debug("Snapshotting state {} ...", context.getCheckpointId()); } unionOffsetStates.clear(); HashMap currentOffsets = new HashMap<>(offsetTable.size()); // remove the unassigned queues in order to avoid read the wrong offset when the 
source restart Set assignedQueues = consumer.fetchMessageQueuesInBalance(topic); offsetTable.entrySet().removeIf(item -> !assignedQueues.contains(item.getKey())); for (Map.Entry entry : offsetTable.entrySet()) { unionOffsetStates.add(Tuple2.of(entry.getKey(), entry.getValue())); currentOffsets.put(entry.getKey(), entry.getValue()); } pendingOffsetsToCommit.put(context.getCheckpointId(), currentOffsets); if (LOG.isDebugEnabled()) { LOG.debug("Snapshotted state, last processed offsets: {}, checkpoint id: {}, timestamp: {}", offsetTable, context.getCheckpointId(), context.getCheckpointTimestamp()); } } @Override public void initializeState(FunctionInitializationContext context) throws Exception { // called every time the user-defined function is initialized, // be that when the function is first initialized or be that // when the function is actually recovering from an earlier checkpoint. // Given this, initializeState() is not only the place where different types of state are initialized, // but also where state recovery logic is included. LOG.debug("initialize State ..."); this.unionOffsetStates = context.getOperatorStateStore().getUnionListState(new ListStateDescriptor<>( OFFSETS_STATE_NAME, TypeInformation.of(new TypeHint>() {}))); this.restored = context.isRestored(); if (restored) { if (restoredOffsets == null) { restoredOffsets = new ConcurrentHashMap<>(); } for (Tuple2 mqOffsets : unionOffsetStates.get()) { if (!restoredOffsets.containsKey(mqOffsets.f0) || restoredOffsets.get(mqOffsets.f0) < mqOffsets.f1) { restoredOffsets.put(mqOffsets.f0, mqOffsets.f1); } } LOG.info("Setting restore state in the consumer. Using the following offsets: {}", restoredOffsets); } else { LOG.info("No restore state for the consumer."); } } @Override public TypeInformation getProducedType() { return schema.getProducedType(); } @Override public void notifyCheckpointComplete(long checkpointId) throws Exception { // callback when checkpoint complete if (!runningChecker.isRunning()) { LOG.debug("notifyCheckpointComplete() called on closed source; returning null."); return; } final int posInMap = pendingOffsetsToCommit.indexOf(checkpointId); if (posInMap == -1) { LOG.warn("Received confirmation for unknown checkpoint id {}", checkpointId); return; } Map offsets = (Map) pendingOffsetsToCommit.remove(posInMap); // remove older checkpoints in map for (int i = 0; i < posInMap; i++) { pendingOffsetsToCommit.remove(0); } if (offsets == null || offsets.size() == 0) { LOG.debug("Checkpoint state was empty."); return; } for (Map.Entry entry : offsets.entrySet()) { consumer.updateConsumeOffset(entry.getKey(), entry.getValue()); } } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java-flink-1.14/org/apache/rocketmq/flink/common/serialization/JsonDeserializationSchema.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink.common.serialization; import org.apache.flink.api.common.serialization.DeserializationSchema; import org.apache.flink.api.common.typeinfo.TypeHint; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.table.data.RowData; import java.io.IOException; /** * 将rocketmq消息反序列化成RowData * @author ChengLong 2021-5-9 13:40:17 */ public class JsonDeserializationSchema implements TagKeyValueDeserializationSchema { private DeserializationSchema key; private DeserializationSchema value; public JsonDeserializationSchema(DeserializationSchema key, DeserializationSchema value) { this.key = key; this.value = value; } @Override public RowData deserializeTagKeyAndValue(byte[] tag, byte[] key, byte[] value) { /*String keyString = key != null ? new String(key, StandardCharsets.UTF_8) : null; String valueString = value != null ? new String(value, StandardCharsets.UTF_8) : null;*/ if (value != null) { try { // 调用sql connector的format进行反序列化 return this.value.deserialize(value); } catch (IOException e) { e.printStackTrace(); } } return null; } @Override public TypeInformation getProducedType() { return TypeInformation.of(new TypeHint(){}); } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java-flink-1.14/org/apache/rocketmq/flink/common/serialization/SimpleTagKeyValueDeserializationSchema.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.flink.common.serialization; import org.apache.flink.api.common.typeinfo.TypeHint; import org.apache.flink.api.common.typeinfo.TypeInformation; import scala.Tuple3; import java.nio.charset.StandardCharsets; /** * 反序列化MessageExt,将tag、key、value以tuple3方式返回 * * @author ChengLong 2021-5-10 09:44:55 */ public class SimpleTagKeyValueDeserializationSchema implements TagKeyValueDeserializationSchema> { @Override public Tuple3 deserializeTagKeyAndValue(byte[] tag, byte[] key, byte[] value) { String tagString = tag != null ? new String(tag, StandardCharsets.UTF_8) : null; String keyString = key != null ? new String(key, StandardCharsets.UTF_8) : null; String valueString = value != null ? 
                new String(value, StandardCharsets.UTF_8) : null;
        return new Tuple3<>(tagString, keyString, valueString);
    }

    @Override
    public TypeInformation<Tuple3<String, String, String>> getProducedType() {
        return TypeInformation.of(new TypeHint<Tuple3<String, String, String>>(){});
    }
}

================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/java-flink-1.14/org/apache/rocketmq/flink/common/serialization/TagKeyValueDeserializationSchema.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.rocketmq.flink.common.serialization;

import org.apache.flink.api.java.typeutils.ResultTypeQueryable;

import java.io.Serializable;

/**
 * Deserialization schema that carries the message tag information.
 * @author ChengLong 2021-5-10 09:43:35
 */
public interface TagKeyValueDeserializationSchema<T> extends ResultTypeQueryable<T>, Serializable {
    T deserializeTagKeyAndValue(byte[] tag, byte[] key, byte[] value);
}

================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory ================================================
com.zto.fire.flink.sql.connector.rocketmq.RocketMQDynamicTableFactory

================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/scala/com/zto/fire/flink/sql/connector/rocketmq/RocketMQDynamicTableFactory.scala ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package com.zto.fire.flink.sql.connector.rocketmq import com.zto.fire.flink.sql.connector.rocketmq.RocketMQOptions._ import org.apache.flink.api.common.serialization.{DeserializationSchema, SerializationSchema} import org.apache.flink.configuration.ConfigOption import org.apache.flink.table.connector.format.{DecodingFormat, EncodingFormat} import org.apache.flink.table.connector.sink.DynamicTableSink import org.apache.flink.table.connector.source.DynamicTableSource import org.apache.flink.table.data.RowData import org.apache.flink.table.factories._ import com.zto.fire.predef._ /** * sql connector的source与sink创建工厂 * * @author ChengLong 2021-5-7 15:48:03 */ class RocketMQDynamicTableFactory extends DynamicTableSourceFactory with DynamicTableSinkFactory { val IDENTIFIER = "fire-rocketmq" override def factoryIdentifier(): String = this.IDENTIFIER private def getKeyDecodingFormat(helper: FactoryUtil.TableFactoryHelper): DecodingFormat[DeserializationSchema[RowData]] = { helper.discoverDecodingFormat(classOf[DeserializationFormatFactory], FactoryUtil.FORMAT) } private def getValueDecodingFormat(helper: FactoryUtil.TableFactoryHelper): DecodingFormat[DeserializationSchema[RowData]] = { helper.discoverDecodingFormat(classOf[DeserializationFormatFactory], FactoryUtil.FORMAT) } private def getKeyEncodingFormat(helper: FactoryUtil.TableFactoryHelper): EncodingFormat[SerializationSchema[RowData]] = { helper.discoverEncodingFormat(classOf[SerializationFormatFactory], FactoryUtil.FORMAT) } private def getValueEncodingFormat(helper: FactoryUtil.TableFactoryHelper): EncodingFormat[SerializationSchema[RowData]] = { helper.discoverEncodingFormat(classOf[SerializationFormatFactory], FactoryUtil.FORMAT) } /** * 必填参数列表 */ override def requiredOptions(): JSet[ConfigOption[_]] = { val set = new JHashSet[ConfigOption[_]] set.add(TOPIC) set.add(PROPS_BOOTSTRAP_SERVERS) set.add(PROPS_GROUP_ID) set } /** * 可选的参数列表 */ override def optionalOptions(): JSet[ConfigOption[_]] = { val optionalOptions = new JHashSet[ConfigOption[_]] optionalOptions } /** * 创建rocketmq table source */ override def createDynamicTableSource(context: DynamicTableFactory.Context): DynamicTableSource = { val helper = FactoryUtil.createTableFactoryHelper(this, context) val tableOptions = helper.getOptions val keyDecodingFormat = this.getKeyDecodingFormat(helper) val valueDecodingFormat = this.getValueDecodingFormat(helper) val withOptions = context.getCatalogTable.getOptions val physicalDataType = context.getCatalogTable.getSchema.toPhysicalRowDataType val keyProjection = createKeyFormatProjection(tableOptions, physicalDataType) val valueProjection = createValueFormatProjection(tableOptions, physicalDataType) val keyPrefix = tableOptions.getOptional(KEY_FIELDS_PREFIX).orElse(null) new RocketMQDynamicTableSource(physicalDataType, keyDecodingFormat, valueDecodingFormat, keyProjection, valueProjection, keyPrefix, withOptions) } /** * 创建rocketmq table sink */ override def createDynamicTableSink(context: DynamicTableFactory.Context): DynamicTableSink = { val helper = FactoryUtil.createTableFactoryHelper(this, context) val tableOptions = helper.getOptions() val keyDecodingFormat = this.getKeyEncodingFormat(helper) val valueDecodingFormat = this.getValueEncodingFormat(helper) val physicalDataType = context.getCatalogTable().getSchema().toPhysicalRowDataType() val keyProjection = RocketMQOptions.createKeyFormatProjection(tableOptions, physicalDataType) val valueProjection = RocketMQOptions.createValueFormatProjection(tableOptions, physicalDataType) val 
keyPrefix = tableOptions.getOptional(RocketMQOptions.KEY_FIELDS_PREFIX).orElse(null) val parallelism = tableOptions.getOptional(FactoryUtil.SINK_PARALLELISM).orElse(8) val withOptions = context.getCatalogTable.getOptions new RocketMQDynamicTableSink(physicalDataType, keyDecodingFormat, valueDecodingFormat, keyProjection, valueProjection, keyPrefix, withOptions) } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/scala/com/zto/fire/flink/sql/connector/rocketmq/RocketMQDynamicTableSink.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.sql.connector.rocketmq import com.zto.fire.common.conf.FireRocketMQConf import com.zto.fire.common.util.LineageManager import com.zto.fire.flink.sql.connector.rocketmq.RocketMQOptions.getRocketMQProperties import com.zto.fire.predef._ import com.zto.fire.common.enu.{Operation => FOperation} import org.apache.flink.api.common.serialization.SerializationSchema import org.apache.flink.table.connector.ChangelogMode import org.apache.flink.table.connector.format.EncodingFormat import org.apache.flink.table.connector.sink.{DynamicTableSink, SinkFunctionProvider} import org.apache.flink.table.data.RowData import org.apache.flink.table.types.DataType import org.apache.flink.table.types.utils.DataTypeUtils import org.apache.rocketmq.flink.common.selector.DefaultTopicSelector import org.apache.rocketmq.flink.common.serialization.JsonSerializationSchema import org.apache.rocketmq.flink.{RocketMQConfig, RocketMQSinkWithTag} /** * 定义source table * * @author ChengLong 2021-5-7 15:48:03 */ class RocketMQDynamicTableSink(physicalDataType: DataType, keyDecodingFormat: EncodingFormat[SerializationSchema[RowData]], valueDecodingFormat: EncodingFormat[SerializationSchema[RowData]], keyProjection: Array[Int], valueProjection: Array[Int], keyPrefix: String, tableOptions: JMap[String, String]) extends DynamicTableSink { override def getChangelogMode(requestedMode: ChangelogMode): ChangelogMode = ChangelogMode.insertOnly() override def getSinkRuntimeProvider(context: DynamicTableSink.Context): DynamicTableSink.SinkRuntimeProvider = { // 获取以rocket.conf.为前缀的配置 val properties = getRocketMQProperties(this.tableOptions) // 获取rocket.brokers.name对应的nameserver地址 val brokerName = tableOptions.get(FireRocketMQConf.ROCKET_BROKERS_NAME) val nameserver = FireRocketMQConf.rocketClusterMap.getOrElse(brokerName, brokerName) if (noEmpty(nameserver)) properties.setProperty(RocketMQConfig.NAME_SERVER_ADDR, nameserver) assert(noEmpty(properties.getProperty(RocketMQConfig.NAME_SERVER_ADDR)), s"""nameserver不能为空,请在with中使用 '${FireRocketMQConf.ROCKET_BROKERS_NAME}'='ip:port' 指定""") // 获取topic信息 val topic = 
tableOptions.get(FireRocketMQConf.ROCKET_TOPICS) if (noEmpty(topic)) properties.setProperty(RocketMQConfig.CONSUMER_TOPIC, topic) assert(noEmpty(properties.getProperty(RocketMQConfig.CONSUMER_TOPIC)), s"""topic不能为空,请在with中使用 '${FireRocketMQConf.ROCKET_TOPICS}'='topicName' 指定""") // 获取tag信息 val tag = tableOptions.get(FireRocketMQConf.ROCKET_CONSUMER_TAG) if (noEmpty(tag)) properties.setProperty(RocketMQConfig.CONSUMER_TAG, tag) else properties.setProperty(RocketMQConfig.CONSUMER_TAG, "*") // sink的并行度 val sinkParallelism = tableOptions.getOrElse(FireRocketMQConf.ROCKET_SINK_PARALLELISM, null) // 消费rocketmq埋点信息 LineageManager.addMQDatasource("rocketmq", nameserver, topic, "", FOperation.SINK) val keyDeserialization = createSerialization(context, keyDecodingFormat, keyProjection, keyPrefix) val valueDeserialization = createSerialization(context, valueDecodingFormat, valueProjection, null) val sink = new RocketMQSinkWithTag[RowData](new JsonSerializationSchema(topic, tag, valueDeserialization), new DefaultTopicSelector(topic), properties) SinkFunctionProvider.of(sink, if (noEmpty(sinkParallelism)) sinkParallelism.trim.toInt else null) } override def copy(): DynamicTableSink = new RocketMQDynamicTableSink(physicalDataType, keyDecodingFormat, valueDecodingFormat, keyProjection, valueProjection, keyPrefix, tableOptions) override def asSummaryString(): String = "fire-rocketmq sink" /** * 创建反序列化器 */ def createSerialization(context: DynamicTableSink.Context, format: EncodingFormat[SerializationSchema[RowData]], projection: Array[Int], prefix: String): SerializationSchema[RowData] = { if (format == null) return null var physicalFormatDataType = DataTypeUtils.projectRow(this.physicalDataType, projection) if (noEmpty(prefix)) { physicalFormatDataType = DataTypeUtils.stripRowPrefix(physicalFormatDataType, prefix) } format.createRuntimeEncoder(context, physicalFormatDataType) } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/scala/com/zto/fire/flink/sql/connector/rocketmq/RocketMQDynamicTableSource.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.flink.sql.connector.rocketmq import com.zto.fire.common.conf.FireRocketMQConf import com.zto.fire.common.util.LineageManager import com.zto.fire.flink.sql.connector.rocketmq.RocketMQOptions.getRocketMQProperties import com.zto.fire.predef._ import com.zto.fire.common.enu.{Operation => FOperation} import org.apache.flink.api.common.serialization.DeserializationSchema import org.apache.flink.table.connector.ChangelogMode import org.apache.flink.table.connector.format.DecodingFormat import org.apache.flink.table.connector.source.{DynamicTableSource, ScanTableSource, SourceFunctionProvider} import org.apache.flink.table.data.RowData import org.apache.flink.table.types.DataType import org.apache.flink.table.types.utils.DataTypeUtils import org.apache.rocketmq.flink.common.serialization.JsonDeserializationSchema import org.apache.rocketmq.flink.{RocketMQConfig, RocketMQSourceWithTag} /** * 定义source table * * @author ChengLong 2021-5-7 15:48:03 */ class RocketMQDynamicTableSource(physicalDataType: DataType, keyDecodingFormat: DecodingFormat[DeserializationSchema[RowData]], valueDecodingFormat: DecodingFormat[DeserializationSchema[RowData]], keyProjection: Array[Int], valueProjection: Array[Int], keyPrefix: String, tableOptions: JMap[String, String]) extends ScanTableSource { override def getChangelogMode: ChangelogMode = ChangelogMode.insertOnly() override def copy(): DynamicTableSource = new RocketMQDynamicTableSource(physicalDataType, keyDecodingFormat, valueDecodingFormat, keyProjection, valueProjection, keyPrefix, tableOptions) override def asSummaryString(): String = "fire-rocketmq source" /** * 创建反序列化器 */ def createDeserialization(context: DynamicTableSource.Context, format: DecodingFormat[DeserializationSchema[RowData]], projection: Array[Int], prefix: String): DeserializationSchema[RowData] = { if (format == null) return null var physicalFormatDataType = DataTypeUtils.projectRow(this.physicalDataType, projection) if (noEmpty(prefix)) { physicalFormatDataType = DataTypeUtils.stripRowPrefix(physicalFormatDataType, prefix) } format.createRuntimeDecoder(context, physicalFormatDataType) } /** * 消费rocketmq中的数据,并反序列化为RowData对象实例 */ override def getScanRuntimeProvider(context: ScanTableSource.ScanContext): ScanTableSource.ScanRuntimeProvider = { // 获取以rocket.conf.为前缀的配置 val properties = getRocketMQProperties(this.tableOptions) // 获取rocket.brokers.name对应的nameserver地址 val brokerName = tableOptions.get(FireRocketMQConf.ROCKET_BROKERS_NAME) val nameserver = FireRocketMQConf.rocketClusterMap.getOrElse(brokerName, brokerName) if (noEmpty(nameserver)) properties.setProperty(RocketMQConfig.NAME_SERVER_ADDR, nameserver) assert(noEmpty(properties.getProperty(RocketMQConfig.NAME_SERVER_ADDR)), s"""nameserver不能为空,请在with中使用 '${FireRocketMQConf.ROCKET_BROKERS_NAME}'='ip:port' 指定""") // 获取topic信息 val topic = tableOptions.get(FireRocketMQConf.ROCKET_TOPICS) if (noEmpty(topic)) properties.setProperty(RocketMQConfig.CONSUMER_TOPIC, topic) assert(noEmpty(properties.getProperty(RocketMQConfig.CONSUMER_TOPIC)), s"""topic不能为空,请在with中使用 '${FireRocketMQConf.ROCKET_TOPICS}'='topicName' 指定""") // 获取groupId信息 val groupId = tableOptions.get(FireRocketMQConf.ROCKET_GROUP_ID) if (noEmpty(groupId)) properties.setProperty(RocketMQConfig.CONSUMER_GROUP, groupId) assert(noEmpty(properties.getProperty(RocketMQConfig.CONSUMER_GROUP)), s"""group.id不能为空,请在with中使用 '${FireRocketMQConf.ROCKET_GROUP_ID}'='groupId' 指定""") // 获取tag信息 val tag = tableOptions.get(FireRocketMQConf.ROCKET_CONSUMER_TAG) if (noEmpty(tag)) 
properties.setProperty(RocketMQConfig.CONSUMER_TAG, tag) else properties.setProperty(RocketMQConfig.CONSUMER_TAG, "*") // 获取起始消费位点 val startOffset = tableOptions.get(FireRocketMQConf.ROCKET_STARTING_OFFSET) if (noEmpty(startOffset)) properties.setProperty(RocketMQConfig.CONSUMER_OFFSET_RESET_TO, startOffset) // 消费rocketmq埋点信息 LineageManager.addMQDatasource("rocketmq", nameserver, topic, groupId, FOperation.SOURCE) val keyDeserialization = createDeserialization(context, keyDecodingFormat, keyProjection, keyPrefix) val valueDeserialization = createDeserialization(context, valueDecodingFormat, valueProjection, null) SourceFunctionProvider.of(new RocketMQSourceWithTag(new JsonDeserializationSchema(keyDeserialization, valueDeserialization), properties), false) } } ================================================ FILE: fire-connectors/flink-connectors/flink-rocketmq/src/main/scala/com/zto/fire/flink/sql/connector/rocketmq/RocketMQOptions.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.sql.connector.rocketmq import com.zto.fire.flink.sql.connector.rocketmq.RocketMQOptions.ValueFieldsStrategy.ValueFieldsStrategy import org.apache.flink.configuration.{ConfigOption, ConfigOptions, ReadableConfig} import org.apache.flink.table.api.{TableException, ValidationException} import org.apache.flink.table.types.DataType import org.apache.flink.table.types.logical.utils.LogicalTypeChecks import java.util import java.util.Properties import java.util.stream.IntStream import com.zto.fire.predef._ import scala.collection.{JavaConversions, JavaConverters} /** * RocketMQ connector支持的with参数 * * @author ChengLong 2021-5-7 15:48:03 */ object RocketMQOptions { val PROPERTIES_PREFIX = "rocket.conf." val TOPIC: ConfigOption[String] = ConfigOptions .key("topic") .stringType .noDefaultValue .withDescription("Topic names from which the table is read. Either 'topic' or 'topic-pattern' must be set for source. Option 'topic' is required for sink.") val PROPS_BOOTSTRAP_SERVERS: ConfigOption[String] = ConfigOptions .key("properties.bootstrap.servers") .stringType .noDefaultValue .withDescription("Required RocketMQ server connection string") val PROPS_GROUP_ID: ConfigOption[String] = ConfigOptions .key("properties.group.id") .stringType.noDefaultValue .withDescription("Required consumer group in RocketMQ consumer, no need for v producer.") val KEY_FIELDS_PREFIX: ConfigOption[String] = ConfigOptions.key("key.fields-prefix") .stringType() .noDefaultValue() .withDescription( s""" |Defines a custom prefix for all fields of the key format to avoid name clashes with fields of the value format. |By default, the prefix is empty. 
If a custom prefix is defined, both the table schema and '${ValueFieldsStrategy.ALL}' |will work with prefixed names. When constructing the data type of the key format, the prefix will be removed and the |non-prefixed names will be used within the key format. Please note that this option requires that must be '${ValueFieldsStrategy.EXCEPT_KEY}'. |""".stripMargin) val KEY_FIELDS: ConfigOption[JList[String]] = ConfigOptions.key("key.fields") .stringType() .asList() .defaultValues() .withDescription( """ |Defines an explicit list of physical columns from the table schema that configure the data type for the key format. |By default, this list is empty and thus a key is undefined. |""".stripMargin) val VALUE_FIELDS_INCLUDE: ConfigOption[ValueFieldsStrategy] = ConfigOptions.key("value.fields-include") .defaultValue(ValueFieldsStrategy.ALL) .withDescription( """ |Defines a strategy how to deal with key columns in the data type of |the value format. By default, 'ValueFieldsStrategy.ALL' physical |columns of the table schema will be included in the value format which |means that key columns appear in the data type for both the key and value format. |""".stripMargin) val FORMAT_SUFFIX = ".format" val KEY_FORMAT: ConfigOption[String] = ConfigOptions.key("key" + FORMAT_SUFFIX) .stringType() .noDefaultValue() .withDescription("Defines the format identifier for encoding key data. The identifier is used to discover a suitable format factory.") val VALUE_FORMAT: ConfigOption[String] = ConfigOptions.key("value" + FORMAT_SUFFIX) .stringType() .noDefaultValue() .withDescription("Defines the format identifier for encoding value data. The identifier is used to discover a suitable format factory.") object ValueFieldsStrategy extends Enumeration { type ValueFieldsStrategy = Value val ALL, EXCEPT_KEY = Value } def createKeyFormatProjection(options: ReadableConfig, physicalDataType: DataType): Array[Int] = { val physicalType = physicalDataType.getLogicalType val optionalKeyFormat = options.getOptional(RocketMQOptions.KEY_FORMAT) val optionalKeyFields = options.getOptional(RocketMQOptions.KEY_FIELDS) if (!optionalKeyFormat.isPresent && optionalKeyFields.isPresent) { throw new ValidationException(s"The option '${RocketMQOptions.KEY_FIELDS.key}' can only be declared if a key format is defined using '${RocketMQOptions.KEY_FORMAT.key}'.") } else if (optionalKeyFormat.isPresent && (!optionalKeyFields.isPresent || optionalKeyFields.get.size == 0)) { throw new ValidationException(s"A key format '${RocketMQOptions.KEY_FORMAT.key}' requires the declaration of one or more of key fields using '${RocketMQOptions.KEY_FIELDS.key}'.") } if (!optionalKeyFormat.isPresent) return new Array[Int](0) val keyPrefix = options.getOptional(RocketMQOptions.KEY_FIELDS_PREFIX).orElse("") val keyFields = JavaConversions.asScalaBuffer(optionalKeyFields.get) val physicalFields = LogicalTypeChecks.getFieldNames(physicalType) keyFields.map((keyField: String) => { def foo(keyField: String): Int = { val pos = physicalFields.indexOf(keyField) // check that field name exists if (pos < 0) throw new ValidationException(s"Could not find the field '${keyField}' in the table schema for usage in the key format. A key field must be a regular, physical column. 
The following columns can be selected in the '${RocketMQOptions.KEY_FIELDS.key}' option:\n${physicalFields}") // check that field name is prefixed correctly if (!keyField.startsWith(keyPrefix)) throw new ValidationException(s"All fields in '${RocketMQOptions.KEY_FIELDS.key}' must be prefixed with '${keyPrefix}' when option '${RocketMQOptions.KEY_FIELDS_PREFIX.key}' is set but field '${keyField}' is not prefixed.") pos } foo(keyField) }).toArray } def createValueFormatProjection(options: ReadableConfig, physicalDataType: DataType): Array[Int] = { val physicalType = physicalDataType.getLogicalType val physicalFieldCount = LogicalTypeChecks.getFieldCount(physicalType) // val physicalFields = IntStream.range(0, physicalFieldCount) val physicalFields = (0 until physicalFieldCount).toArray val keyPrefix = options.getOptional(KEY_FIELDS_PREFIX).orElse("") val strategy = options.get(VALUE_FIELDS_INCLUDE); if (strategy == ValueFieldsStrategy.ALL) { if (keyPrefix.nonEmpty) { throw new ValidationException(s"A key prefix is not allowed when option '${VALUE_FIELDS_INCLUDE.key()}' is set to '${ValueFieldsStrategy.ALL}'. Set it to '${ValueFieldsStrategy.EXCEPT_KEY}' instead to avoid field overlaps.") } return physicalFields } else if (strategy == ValueFieldsStrategy.EXCEPT_KEY) { val keyProjection = createKeyFormatProjection(options, physicalDataType); return physicalFields.filter(pos => !keyProjection.contains(pos)) } throw new TableException(s"Unknown value fields strategy:$strategy"); } /** * 是否存在以properties.开头的参数 */ private def hasRocketMQClientProperties(tableOptions: util.Map[String, String]) = { JavaConversions.mapAsScalaMap(tableOptions) .keySet .filter((k: String) => k.startsWith(PROPERTIES_PREFIX)).size > 0 } /** * 获取以rocket.conf.开头的所有的参数 */ def getRocketMQProperties(tableOptions: util.Map[String, String]): Properties = { val rocketMQProperties = new Properties if (hasRocketMQClientProperties(tableOptions)) JavaConversions.mapAsScalaMap(tableOptions).keySet.filter((key: String) => key.startsWith(PROPERTIES_PREFIX)).foreach((key: String) => { def foo(key: String): Unit = { val value = tableOptions.get(key) val subKey = key.substring(PROPERTIES_PREFIX.length) rocketMQProperties.put(subKey, value) } foo(key) }) rocketMQProperties } } ================================================ FILE: fire-connectors/flink-connectors/pom.xml ================================================ 4.0.0 fire-flink-connectors pom Fire : Connectors : Fink : flink-clickhouse flink-rocketmq flink-es fire-connectors com.zto.fire 2.3.2-SNAPSHOT ../pom.xml scala-tools.org Scala-Tools Maven2 Repository http://scala-tools.org/repo-releases scala-tools.org Scala-Tools Maven2 Repository http://scala-tools.org/repo-releases org.specs specs 1.2.5 test src/main/scala src/test/scala org.scala-tools maven-scala-plugin compile testCompile ${scala.version} -target:jvm-1.5 org.apache.maven.plugins maven-eclipse-plugin true ch.epfl.lamp.sdt.core.scalabuilder ch.epfl.lamp.sdt.core.scalanature org.eclipse.jdt.launching.JRE_CONTAINER ch.epfl.lamp.sdt.launching.SCALA_CONTAINER org.scala-tools maven-scala-plugin ${scala.version} ================================================ FILE: fire-connectors/pom.xml ================================================ 4.0.0 fire-connectors pom Fire : Connectors : com.zto.fire fire-parent 2.3.2-SNAPSHOT ../pom.xml base-connectors spark-connectors flink-connectors com.zto.fire fire-common_${scala.binary.version} ${fire.version} ${maven.scope} com.zto.fire fire-core_${scala.binary.version} ${fire.version} 
${maven.scope} com.zto.fire fire-metrics_${scala.binary.version} ${fire.version} ${maven.scope} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-connectors/spark-connectors/pom.xml ================================================ 4.0.0 fire-spark-connectors pom Fire : Connectors : Spark : spark-hbase spark-rocketmq fire-connectors com.zto.fire 2.3.2-SNAPSHOT ../pom.xml scala-tools.org Scala-Tools Maven2 Repository http://scala-tools.org/repo-releases scala-tools.org Scala-Tools Maven2 Repository http://scala-tools.org/repo-releases org.specs specs 1.2.5 test src/main/scala src/test/scala org.scala-tools maven-scala-plugin compile testCompile ${scala.version} -target:jvm-1.5 org.apache.maven.plugins maven-eclipse-plugin true ch.epfl.lamp.sdt.core.scalabuilder ch.epfl.lamp.sdt.core.scalanature org.eclipse.jdt.launching.JRE_CONTAINER ch.epfl.lamp.sdt.launching.SCALA_CONTAINER org.scala-tools maven-scala-plugin ${scala.version} ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/pom.xml ================================================ 4.0.0 fire-connector-spark-hbase_${spark.reference} jar Fire : Connectors : Spark : HBase fire-spark-connectors com.zto.fire 2.3.2-SNAPSHOT com.zto.fire fire-enhance-spark_${spark.reference} ${fire.version} ${maven.scope} org.apache.spark spark-core_${scala.binary.version} ${spark.version} provided org.scala-lang scala-library org.scala-lang scalap com.google.code.findbugs jsr305 javax.servlet servlet-api com.google.code.findbugs jsr305 1.3.9 provided true org.apache.spark spark-sql_${scala.binary.version} ${spark.version} provided org.apache.spark spark-streaming_${scala.binary.version} ${spark.version} provided org.apache.spark spark-streaming_${scala.binary.version} ${spark.version} test-jar tests test org.apache.hadoop hadoop-client ${hadoop.version} ${maven.scope} log4j log4j javax.servlet servlet-api javax.servlet.jsp jsp-api org.jruby jruby-complete org.jboss.netty netty io.netty netty org.apache.hadoop hadoop-common ${hadoop.version} ${maven.scope} log4j log4j javax.servlet servlet-api javax.servlet.jsp jsp-api org.jruby jruby-complete org.jboss.netty netty io.netty netty com.google.code.findbugs jsr305 org.apache.hadoop hadoop-common ${hadoop.version} test-jar test log4j log4j javax.servlet servlet-api javax.servlet.jsp jsp-api org.jruby jruby-complete org.jboss.netty netty io.netty netty com.google.code.findbugs jsr305 org.apache.hadoop hadoop-hdfs ${hadoop.version} test-jar test log4j log4j javax.servlet servlet-api javax.servlet.jsp jsp-api org.jruby jruby-complete org.jboss.netty netty io.netty netty com.google.protobuf protobuf-java 2.5.0 ${maven.scope} org.apache.commons commons-lang3 3.5 ${maven.scope} org.apache.hbase hbase-common ${hbase.version} ${maven.scope} org.apache.hbase hbase-client ${hbase.version} ${maven.scope} log4j log4j org.apache.thrift thrift org.jruby jruby-complete org.slf4j slf4j-log4j12 org.mortbay.jetty jsp-2.1 org.mortbay.jetty jsp-api-2.1 org.mortbay.jetty servlet-api-2.5 com.sun.jersey jersey-core com.sun.jersey jersey-json com.sun.jersey jersey-server org.mortbay.jetty jetty org.mortbay.jetty jetty-util tomcat jasper-runtime tomcat jasper-compiler org.jruby jruby-complete org.jboss.netty netty io.netty netty org.apache.hbase hbase-protocol ${hbase.version} ${maven.scope} org.apache.hbase hbase-annotations ${hbase.version} test-jar test org.apache.hbase hbase-hadoop-compat ${hbase.version} 
test test-jar log4j log4j org.apache.thrift thrift org.jruby jruby-complete org.slf4j slf4j-log4j12 org.mortbay.jetty jsp-2.1 org.mortbay.jetty jsp-api-2.1 org.mortbay.jetty servlet-api-2.5 com.sun.jersey jersey-core com.sun.jersey jersey-json com.sun.jersey jersey-server org.mortbay.jetty jetty org.mortbay.jetty jetty-util tomcat jasper-runtime tomcat jasper-compiler org.jruby jruby-complete org.jboss.netty netty io.netty netty org.apache.hbase hbase-hadoop2-compat ${hbase.version} test test-jar log4j log4j org.apache.thrift thrift org.jruby jruby-complete org.slf4j slf4j-log4j12 org.mortbay.jetty jsp-2.1 org.mortbay.jetty jsp-api-2.1 org.mortbay.jetty servlet-api-2.5 com.sun.jersey jersey-core com.sun.jersey jersey-json com.sun.jersey jersey-server org.mortbay.jetty jetty org.mortbay.jetty jetty-util tomcat jasper-runtime tomcat jasper-compiler org.jruby jruby-complete org.jboss.netty netty io.netty netty org.apache.hbase hbase-server ${hbase.version} ${maven.scope} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/java/org/apache/hadoop/hbase/client/ConnFactoryExtend.java ================================================ /** * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.client; import java.io.Serializable; public class ConnFactoryExtend extends ConnectionFactory implements Serializable { public ConnFactoryExtend() { } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/java/org/apache/hadoop/hbase/client/ConnectionFactory.java ================================================ /** * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.hadoop.hbase.client; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.classification.InterfaceStability; import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.security.UserProvider; import java.io.IOException; import java.lang.reflect.Constructor; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.Map; import java.util.Set; import java.util.concurrent.ExecutorService; import static org.apache.hadoop.hbase.client.ConnectionManager.MAX_CACHED_CONNECTION_INSTANCES; /** * A non-instantiable class that manages creation of {@link Connection}s. * Managing the lifecycle of the {@link Connection}s to the cluster is the responsibility of * the caller. * From a {@link Connection}, {@link Table} implementations are retrieved * with {@link Connection#getTable(TableName)}. Example: *
 * Connection connection = ConnectionFactory.createConnection(config);
 * Table table = connection.getTable(TableName.valueOf("table1"));
 * try {
 *   // Use the table as needed, for a single operation and a single thread
 * } finally {
 *   table.close();
 *   connection.close();
 * }
 * 
* * Similarly, {@link Connection} also returns {@link Admin} and {@link RegionLocator} * implementations. * * This class replaces {@link HConnectionManager}, which is now deprecated. * @see Connection * @since 0.99.0 */ @InterfaceAudience.Public @InterfaceStability.Evolving public class ConnectionFactory { public static final Log LOG = LogFactory.getLog(ConnectionFactory.class); /** No public c.tors */ protected ConnectionFactory() { } /** * Create a new Connection instance using default HBaseConfiguration. Connection * encapsulates all housekeeping for a connection to the cluster. All tables and interfaces * created from returned connection share zookeeper connection, meta cache, and connections * to region servers and masters. *
* The caller is responsible for calling {@link Connection#close()} on the returned * connection instance. * * Typical usage: *
   * Connection connection = ConnectionFactory.createConnection();
   * Table table = connection.getTable(TableName.valueOf("mytable"));
   * try {
   *   table.get(...);
   *   ...
   * } finally {
   *   table.close();
   *   connection.close();
   * }
   * 
* * @return Connection object for conf */ public static Connection createConnection() throws IOException { return createConnection(HBaseConfiguration.create(), null, null); } /** * Create a new Connection instance using the passed conf instance. Connection * encapsulates all housekeeping for a connection to the cluster. All tables and interfaces * created from returned connection share zookeeper connection, meta cache, and connections * to region servers and masters. *
* The caller is responsible for calling {@link Connection#close()} on the returned * connection instance. * * Typical usage: *
   * Connection connection = ConnectionFactory.createConnection(conf);
   * Table table = connection.getTable(TableName.valueOf("mytable"));
   * try {
   *   table.get(...);
   *   ...
   * } finally {
   *   table.close();
   *   connection.close();
   * }
   * 
* * @param conf configuration * @return Connection object for conf */ public static Connection createConnection(Configuration conf) throws IOException { return createConnection(conf, null, null); } /** * Create a new Connection instance using the passed conf instance. Connection * encapsulates all housekeeping for a connection to the cluster. All tables and interfaces * created from returned connection share zookeeper connection, meta cache, and connections * to region servers and masters. *
* The caller is responsible for calling {@link Connection#close()} on the returned * connection instance. * * Typical usage: *
   * Connection connection = ConnectionFactory.createConnection(conf);
   * Table table = connection.getTable(TableName.valueOf("mytable"));
   * try {
   *   table.get(...);
   *   ...
   * } finally {
   *   table.close();
   *   connection.close();
   * }
   * 
* * @param conf configuration * @param pool the thread pool to use for batch operations * @return Connection object for conf */ public static Connection createConnection(Configuration conf, ExecutorService pool) throws IOException { return createConnection(conf, pool, null); } /** * Create a new Connection instance using the passed conf instance. Connection * encapsulates all housekeeping for a connection to the cluster. All tables and interfaces * created from returned connection share zookeeper connection, meta cache, and connections * to region servers and masters. *
* The caller is responsible for calling {@link Connection#close()} on the returned * connection instance. * * Typical usage: *
   * Connection connection = ConnectionFactory.createConnection(conf);
   * Table table = connection.getTable(TableName.valueOf("table1"));
   * try {
   *   table.get(...);
   *   ...
   * } finally {
   *   table.close();
   *   connection.close();
   * }
   * 
* * @param conf configuration * @param user the user the connection is for * @return Connection object for conf */ public static Connection createConnection(Configuration conf, User user) throws IOException { return createConnection(conf, null, user); } /** * Create a new Connection instance using the passed conf instance. Connection * encapsulates all housekeeping for a connection to the cluster. All tables and interfaces * created from returned connection share zookeeper connection, meta cache, and connections * to region servers and masters. *
* The caller is responsible for calling {@link Connection#close()} on the returned * connection instance. * * Typical usage: *
   * Connection connection = ConnectionFactory.createConnection(conf);
   * Table table = connection.getTable(TableName.valueOf("table1"));
   * try {
   *   table.get(...);
   *   ...
   * } finally {
   *   table.close();
   *   connection.close();
   * }
   * 
* * @param conf configuration * @param user the user the connection is for * @param pool the thread pool to use for batch operations * @return Connection object for conf */ public static Connection createConnection(Configuration conf, ExecutorService pool, User user) throws IOException { if (user == null) { UserProvider provider = UserProvider.instantiate(conf); user = provider.getCurrent(); } return createConnection(conf, false, pool, user); } static Connection createConnection(final Configuration conf, final boolean managed, final ExecutorService pool, final User user) throws IOException { String className = conf.get(HConnection.HBASE_CLIENT_CONNECTION_IMPL, ConnectionManager.HConnectionImplementation.class.getName()); Class clazz = null; try { clazz = Class.forName(className); } catch (ClassNotFoundException e) { throw new IOException(e); } try { // Default HCM#HCI is not accessible; make it so before invoking. Constructor constructor = clazz.getDeclaredConstructor(Configuration.class, boolean.class, ExecutorService.class, User.class); constructor.setAccessible(true); return (Connection) constructor.newInstance(conf, managed, pool, user); } catch (Exception e) { throw new IOException(e); } } //clean up all hbase connection public static void cleanup() { deleteAllConnections(); } //clean up all hbase connection public void cleanupInstance() { deleteAllConnectionsInstance(); } public static Connection getConnection(final Configuration conf) throws IOException { return getConnectionInternal(conf); } public Connection getConnectionInstance(final Configuration conf) throws IOException { return getConnectionInternalInstance(conf); } // An LRU Map of HConnectionKey -> HConnection (TableServer). All // access must be synchronized. This map is not private because tests // need to be able to tinker with it. 
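  // Editorial sketch, not part of the upstream file: the getConnection()/cleanup() methods added
  // above layer a process-wide, per-Configuration connection cache on top of the stock factory.
  // Assuming a hypothetical table name, a caller wanting the cached behaviour would do roughly:
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   Connection connection = ConnectionFactory.getConnection(conf); // cached, keyed by HConnectionKey(conf)
  //   try (Table table = connection.getTable(TableName.valueOf("demo_table"))) {
  //     table.get(new Get(Bytes.toBytes("demo_rowKey")));            // "demo_*" names are placeholders
  //   }
  //   // The cached connection is not closed per call; it is released at shutdown via:
  //   ConnectionFactory.cleanup();
  //
  // createConnection(...) keeps the stock semantics (the caller owns and closes the Connection),
  // while getConnection(...) reuses one Connection per distinct Configuration, recreating it if
  // it has been closed in the meantime.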
  private static final Map<HConnectionKey, Connection> CONNECTION_INSTANCES_CACHE;

  static {
    CONNECTION_INSTANCES_CACHE = new LinkedHashMap<HConnectionKey, Connection>(
        (int) (MAX_CACHED_CONNECTION_INSTANCES / 0.75F) + 1, 0.75F, true) {
      @Override
      protected boolean removeEldestEntry(Map.Entry<HConnectionKey, Connection> eldest) {
        return size() > MAX_CACHED_CONNECTION_INSTANCES;
      }
    };
  }

  private static Connection getConnectionInternal(final Configuration conf) throws IOException {
    HConnectionKey connectionKey = new HConnectionKey(conf);
    synchronized (CONNECTION_INSTANCES_CACHE) {
      Connection connection = CONNECTION_INSTANCES_CACHE.get(connectionKey);
      if (connection == null) {
        connection = ConnectionFactory.createConnection(conf);
        CONNECTION_INSTANCES_CACHE.put(connectionKey, connection);
      } else if (connection.isClosed()) {
        deleteConnection(connectionKey);
        connection = ConnectionFactory.createConnection(conf);
        CONNECTION_INSTANCES_CACHE.put(connectionKey, connection);
      }
      return connection;
    }
  }

  private Connection getConnectionInternalInstance(final Configuration conf) throws IOException {
    HConnectionKey connectionKey = new HConnectionKey(conf);
    synchronized (CONNECTION_INSTANCES_CACHE) {
      Connection connection = CONNECTION_INSTANCES_CACHE.get(connectionKey);
      if (connection == null) {
        connection = ConnectionFactory.createConnection(conf);
        CONNECTION_INSTANCES_CACHE.put(connectionKey, connection);
      } else if (connection.isClosed()) {
        deleteConnection(connectionKey);
        connection = ConnectionFactory.createConnection(conf);
        CONNECTION_INSTANCES_CACHE.put(connectionKey, connection);
      }
      return connection;
    }
  }

  private static void deleteConnection(HConnectionKey connectionKey) {
    synchronized (CONNECTION_INSTANCES_CACHE) {
      Connection connection = CONNECTION_INSTANCES_CACHE.get(connectionKey);
      if (connection != null) {
        CONNECTION_INSTANCES_CACHE.remove(connectionKey);
        try {
          connection.close();
        } catch (IOException e) {
          LOG.error("Failed to close connection in the ConnectionFactory list", e);
        }
      } else {
        LOG.error("Connection not found in the ConnectionFactory list, can't delete it "
            + "(connection key=" + connectionKey + "). May be the key was modified?", new Exception());
      }
    }
  }

  private static void deleteAllConnections() {
    synchronized (CONNECTION_INSTANCES_CACHE) {
      Set<HConnectionKey> connectionKeys = new HashSet<>();
      connectionKeys.addAll(CONNECTION_INSTANCES_CACHE.keySet());
      for (HConnectionKey connectionKey : connectionKeys) {
        deleteConnection(connectionKey);
      }
      CONNECTION_INSTANCES_CACHE.clear();
    }
  }

  private void deleteAllConnectionsInstance() {
    synchronized (CONNECTION_INSTANCES_CACHE) {
      Set<HConnectionKey> connectionKeys = new HashSet<>();
      connectionKeys.addAll(CONNECTION_INSTANCES_CACHE.keySet());
      for (HConnectionKey connectionKey : connectionKeys) {
        deleteConnection(connectionKey);
      }
      CONNECTION_INSTANCES_CACHE.clear();
    }
  }
}
================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/java/org/apache/hadoop/hbase/spark/SparkSQLPushDownFilter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark; import com.google.protobuf.ByteString; import com.google.protobuf.InvalidProtocolBufferException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.exceptions.DeserializationException; import org.apache.hadoop.hbase.filter.FilterBase; import org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos; import org.apache.hadoop.hbase.util.ByteStringer; import org.apache.hadoop.hbase.util.Bytes; import scala.collection.mutable.MutableList; import java.io.IOException; import java.util.HashMap; import java.util.List; import java.util.Map; /** * This filter will push down all qualifier logic given to us * by SparkSQL so that we have make the filters at the region server level * and avoid sending the data back to the client to be filtered. */ public class SparkSQLPushDownFilter extends FilterBase{ protected static final Log log = LogFactory.getLog(SparkSQLPushDownFilter.class); //The following values are populated with protobuffer DynamicLogicExpression dynamicLogicExpression; byte[][] valueFromQueryArray; HashMap> currentCellToColumnIndexMap; //The following values are transient HashMap columnToCurrentRowValueMap = null; static final byte[] rowKeyFamily = new byte[0]; static final byte[] rowKeyQualifier = Bytes.toBytes("key"); public SparkSQLPushDownFilter(DynamicLogicExpression dynamicLogicExpression, byte[][] valueFromQueryArray, HashMap> currentCellToColumnIndexMap) { this.dynamicLogicExpression = dynamicLogicExpression; this.valueFromQueryArray = valueFromQueryArray; this.currentCellToColumnIndexMap = currentCellToColumnIndexMap; } public SparkSQLPushDownFilter(DynamicLogicExpression dynamicLogicExpression, byte[][] valueFromQueryArray, MutableList columnDefinitions) { this.dynamicLogicExpression = dynamicLogicExpression; this.valueFromQueryArray = valueFromQueryArray; //generate family qualifier to index mapping this.currentCellToColumnIndexMap = new HashMap<>(); for (int i = 0; i < columnDefinitions.size(); i++) { SchemaQualifierDefinition definition = columnDefinitions.get(i).get(); ByteArrayComparable familyByteComparable = new ByteArrayComparable(definition.columnFamilyBytes(), 0, definition.columnFamilyBytes().length); HashMap qualifierIndexMap = currentCellToColumnIndexMap.get(familyByteComparable); if (qualifierIndexMap == null) { qualifierIndexMap = new HashMap<>(); currentCellToColumnIndexMap.put(familyByteComparable, qualifierIndexMap); } ByteArrayComparable qualifierByteComparable = new ByteArrayComparable(definition.qualifierBytes(), 0, definition.qualifierBytes().length); qualifierIndexMap.put(qualifierByteComparable, definition.columnName()); } } @Override public ReturnCode filterKeyValue(Cell c) throws IOException { //If the map RowValueMap is empty then we need to populate // the row key if (columnToCurrentRowValueMap == null) { columnToCurrentRowValueMap = new HashMap<>(); HashMap qualifierColumnMap = currentCellToColumnIndexMap.get( new ByteArrayComparable(rowKeyFamily, 0, rowKeyFamily.length)); if (qualifierColumnMap != null) { 
String rowKeyColumnName = qualifierColumnMap.get( new ByteArrayComparable(rowKeyQualifier, 0, rowKeyQualifier.length)); //Make sure that the rowKey is part of the where clause if (rowKeyColumnName != null) { columnToCurrentRowValueMap.put(rowKeyColumnName, new ByteArrayComparable(c.getRowArray(), c.getRowOffset(), c.getRowLength())); } } } //Always populate the column value into the RowValueMap ByteArrayComparable currentFamilyByteComparable = new ByteArrayComparable(c.getFamilyArray(), c.getFamilyOffset(), c.getFamilyLength()); HashMap qualifierColumnMap = currentCellToColumnIndexMap.get( currentFamilyByteComparable); if (qualifierColumnMap != null) { String columnName = qualifierColumnMap.get( new ByteArrayComparable(c.getQualifierArray(), c.getQualifierOffset(), c.getQualifierLength())); if (columnName != null) { columnToCurrentRowValueMap.put(columnName, new ByteArrayComparable(c.getValueArray(), c.getValueOffset(), c.getValueLength())); } } return ReturnCode.INCLUDE; } @Override public boolean filterRow() throws IOException { try { boolean result = dynamicLogicExpression.execute(columnToCurrentRowValueMap, valueFromQueryArray); columnToCurrentRowValueMap = null; return !result; } catch (Throwable e) { log.error("Error running dynamic logic on row", e); } return false; } /** * @param pbBytes A pb serialized instance * @return An instance of SparkSQLPushDownFilter * @throws org.apache.hadoop.hbase.exceptions.DeserializationException */ @SuppressWarnings("unused") public static SparkSQLPushDownFilter parseFrom(final byte[] pbBytes) throws DeserializationException { FilterProtos.SQLPredicatePushDownFilter proto; try { proto = FilterProtos.SQLPredicatePushDownFilter.parseFrom(pbBytes); } catch (InvalidProtocolBufferException e) { throw new DeserializationException(e); } //Load DynamicLogicExpression DynamicLogicExpression dynamicLogicExpression = DynamicLogicExpressionBuilder.build(proto.getDynamicLogicExpression()); //Load valuesFromQuery final List valueFromQueryArrayList = proto.getValueFromQueryArrayList(); byte[][] valueFromQueryArray = new byte[valueFromQueryArrayList.size()][]; for (int i = 0; i < valueFromQueryArrayList.size(); i++) { valueFromQueryArray[i] = valueFromQueryArrayList.get(i).toByteArray(); } //Load mapping from HBase family/qualifier to Spark SQL columnName HashMap> currentCellToColumnIndexMap = new HashMap<>(); for (FilterProtos.SQLPredicatePushDownCellToColumnMapping sqlPredicatePushDownCellToColumnMapping : proto.getCellToColumnMappingList()) { byte[] familyArray = sqlPredicatePushDownCellToColumnMapping.getColumnFamily().toByteArray(); ByteArrayComparable familyByteComparable = new ByteArrayComparable(familyArray, 0, familyArray.length); HashMap qualifierMap = currentCellToColumnIndexMap.get(familyByteComparable); if (qualifierMap == null) { qualifierMap = new HashMap<>(); currentCellToColumnIndexMap.put(familyByteComparable, qualifierMap); } byte[] qualifierArray = sqlPredicatePushDownCellToColumnMapping.getQualifier().toByteArray(); ByteArrayComparable qualifierByteComparable = new ByteArrayComparable(qualifierArray, 0 ,qualifierArray.length); qualifierMap.put(qualifierByteComparable, sqlPredicatePushDownCellToColumnMapping.getColumnName()); } return new SparkSQLPushDownFilter(dynamicLogicExpression, valueFromQueryArray, currentCellToColumnIndexMap); } /** * @return The filter serialized using pb */ public byte[] toByteArray() { FilterProtos.SQLPredicatePushDownFilter.Builder builder = FilterProtos.SQLPredicatePushDownFilter.newBuilder(); 
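    // Editorial note (sketch, not in the original source): this method produces the protobuf bytes
    // that travel with the Scan from the Spark side to the HBase region server; the static
    // parseFrom(byte[]) defined above rebuilds the filter on the server side. The round trip is
    // essentially:
    //
    //   byte[] wire = filter.toByteArray();                                       // client side
    //   SparkSQLPushDownFilter rebuilt = SparkSQLPushDownFilter.parseFrom(wire);  // region server side
    //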
FilterProtos.SQLPredicatePushDownCellToColumnMapping.Builder columnMappingBuilder = FilterProtos.SQLPredicatePushDownCellToColumnMapping.newBuilder(); builder.setDynamicLogicExpression(dynamicLogicExpression.toExpressionString()); for (byte[] valueFromQuery: valueFromQueryArray) { builder.addValueFromQueryArray(ByteStringer.wrap(valueFromQuery)); } for (Map.Entry> familyEntry : currentCellToColumnIndexMap.entrySet()) { for (Map.Entry qualifierEntry : familyEntry.getValue().entrySet()) { columnMappingBuilder.setColumnFamily( ByteStringer.wrap(familyEntry.getKey().bytes())); columnMappingBuilder.setQualifier( ByteStringer.wrap(qualifierEntry.getKey().bytes())); columnMappingBuilder.setColumnName(qualifierEntry.getValue()); builder.addCellToColumnMapping(columnMappingBuilder.build()); } } return builder.build().toByteArray(); } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkDeleteExample.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.example.hbasecontext; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.Delete; import org.apache.hadoop.hbase.spark.JavaHBaseContext; import org.apache.hadoop.hbase.util.Bytes; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import java.util.ArrayList; import java.util.List; /** * This is a simple example of deleting records in HBase * with the bulkDelete function. 
*/ final public class JavaHBaseBulkDeleteExample { private JavaHBaseBulkDeleteExample() {} public static void main(String[] args) { if (args.length < 1) { System.out.println("JavaHBaseBulkDeleteExample {tableName}"); return; } String tableName = args[0]; SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkDeleteExample " + tableName); JavaSparkContext jsc = new JavaSparkContext(sparkConf); try { List list = new ArrayList<>(); list.add(Bytes.toBytes("1")); list.add(Bytes.toBytes("2")); list.add(Bytes.toBytes("3")); list.add(Bytes.toBytes("4")); list.add(Bytes.toBytes("5")); JavaRDD rdd = jsc.parallelize(list); Configuration conf = HBaseConfiguration.create(); JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); hbaseContext.bulkDelete(rdd, TableName.valueOf(tableName), new DeleteFunction(), 4); } finally { jsc.stop(); } } public static class DeleteFunction implements Function { private static final long serialVersionUID = 1L; public Delete call(byte[] v) throws Exception { return new Delete(v); } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkGetExample.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.example.hbasecontext; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.spark.JavaHBaseContext; import org.apache.hadoop.hbase.util.Bytes; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; /** * This is a simple example of getting records in HBase * with the bulkGet function. 
*/ final public class JavaHBaseBulkGetExample { private JavaHBaseBulkGetExample() {} public static void main(String[] args) { if (args.length < 1) { System.out.println("JavaHBaseBulkGetExample {tableName}"); return; } String tableName = args[0]; SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkGetExample " + tableName); JavaSparkContext jsc = new JavaSparkContext(sparkConf); try { List list = new ArrayList<>(); list.add(Bytes.toBytes("1")); list.add(Bytes.toBytes("2")); list.add(Bytes.toBytes("3")); list.add(Bytes.toBytes("4")); list.add(Bytes.toBytes("5")); JavaRDD rdd = jsc.parallelize(list); Configuration conf = HBaseConfiguration.create(); JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); hbaseContext.bulkGet(TableName.valueOf(tableName), 2, rdd, new GetFunction(), new ResultFunction()); } finally { jsc.stop(); } } public static class GetFunction implements Function { private static final long serialVersionUID = 1L; public Get call(byte[] v) throws Exception { return new Get(v); } } public static class ResultFunction implements Function { private static final long serialVersionUID = 1L; public String call(Result result) throws Exception { Iterator it = result.listCells().iterator(); StringBuilder b = new StringBuilder(); b.append(Bytes.toString(result.getRow())).append(":"); while (it.hasNext()) { Cell cell = it.next(); String q = Bytes.toString(cell.getQualifierArray()); if (q.equals("counter")) { b.append("(") .append(Bytes.toString(cell.getQualifierArray())) .append(",") .append(Bytes.toLong(cell.getValueArray())) .append(")"); } else { b.append("(") .append(Bytes.toString(cell.getQualifierArray())) .append(",") .append(Bytes.toString(cell.getValueArray())) .append(")"); } } return b.toString(); } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseBulkPutExample.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.example.hbasecontext; import java.util.ArrayList; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.spark.JavaHBaseContext; import org.apache.hadoop.hbase.util.Bytes; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; /** * This is a simple example of putting records in HBase * with the bulkPut function. 
*/ final public class JavaHBaseBulkPutExample { private JavaHBaseBulkPutExample() {} public static void main(String[] args) { if (args.length < 2) { System.out.println("JavaHBaseBulkPutExample " + "{tableName} {columnFamily}"); return; } String tableName = args[0]; String columnFamily = args[1]; SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkPutExample " + tableName); JavaSparkContext jsc = new JavaSparkContext(sparkConf); try { List list = new ArrayList<>(); list.add("1," + columnFamily + ",a,1"); list.add("2," + columnFamily + ",a,2"); list.add("3," + columnFamily + ",a,3"); list.add("4," + columnFamily + ",a,4"); list.add("5," + columnFamily + ",a,5"); JavaRDD rdd = jsc.parallelize(list); Configuration conf = HBaseConfiguration.create(); JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); hbaseContext.bulkPut(rdd, TableName.valueOf(tableName), new PutFunction()); } finally { jsc.stop(); } } public static class PutFunction implements Function { private static final long serialVersionUID = 1L; public Put call(String v) throws Exception { String[] cells = v.split(","); Put put = new Put(Bytes.toBytes(cells[0])); put.addColumn(Bytes.toBytes(cells[1]), Bytes.toBytes(cells[2]), Bytes.toBytes(cells[3])); return put; } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseDistributedScan.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.example.hbasecontext; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.spark.JavaHBaseContext; import org.apache.hadoop.hbase.util.Bytes; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import scala.Tuple2; /** * This is a simple example of scanning records from HBase * with the hbaseRDD function. 
*/ final public class JavaHBaseDistributedScan { private JavaHBaseDistributedScan() {} public static void main(String[] args) { if (args.length < 1) { System.out.println("JavaHBaseDistributedScan {tableName}"); return; } String tableName = args[0]; SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseDistributedScan " + tableName); JavaSparkContext jsc = new JavaSparkContext(sparkConf); try { Configuration conf = HBaseConfiguration.create(); JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); Scan scan = new Scan(); scan.setCaching(100); JavaRDD> javaRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan); List results = javaRdd.map(new ScanConvertFunction()).collect(); System.out.println("Result Size: " + results.size()); } finally { jsc.stop(); } } private static class ScanConvertFunction implements Function, String> { @Override public String call(Tuple2 v1) throws Exception { return Bytes.toString(v1._1().copyBytes()); } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseMapGetPutExample.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.hadoop.hbase.spark.example.hbasecontext; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.BufferedMutator; import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.spark.JavaHBaseContext; import org.apache.hadoop.hbase.util.Bytes; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.VoidFunction; import scala.Tuple2; /** * This is a simple example of using the foreachPartition * method with a HBase connection */ final public class JavaHBaseMapGetPutExample { private JavaHBaseMapGetPutExample() {} public static void main(String[] args) { if (args.length < 1) { System.out.println("JavaHBaseBulkGetExample {tableName}"); return; } final String tableName = args[0]; SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkGetExample " + tableName); JavaSparkContext jsc = new JavaSparkContext(sparkConf); try { List list = new ArrayList<>(); list.add(Bytes.toBytes("1")); list.add(Bytes.toBytes("2")); list.add(Bytes.toBytes("3")); list.add(Bytes.toBytes("4")); list.add(Bytes.toBytes("5")); JavaRDD rdd = jsc.parallelize(list); Configuration conf = HBaseConfiguration.create(); JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); hbaseContext.foreachPartition(rdd, new VoidFunction, Connection>>() { public void call(Tuple2, Connection> t) throws Exception { Table table = t._2().getTable(TableName.valueOf(tableName)); BufferedMutator mutator = t._2().getBufferedMutator(TableName.valueOf(tableName)); while (t._1().hasNext()) { byte[] b = t._1().next(); Result r = table.get(new Get(b)); if (r.getExists()) { mutator.mutate(new Put(b)); } } mutator.flush(); mutator.close(); table.close(); } }); } finally { jsc.stop(); } } public static class GetFunction implements Function { private static final long serialVersionUID = 1L; public Get call(byte[] v) throws Exception { return new Get(v); } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/java/org/apache/hadoop/hbase/spark/example/hbasecontext/JavaHBaseStreamingBulkPutExample.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.hadoop.hbase.spark.example.hbasecontext; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.spark.JavaHBaseContext; import org.apache.hadoop.hbase.util.Bytes; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.streaming.Duration; import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; import org.apache.spark.streaming.api.java.JavaStreamingContext; /** * This is a simple example of BulkPut with Spark Streaming */ final public class JavaHBaseStreamingBulkPutExample { private JavaHBaseStreamingBulkPutExample() {} public static void main(String[] args) { if (args.length < 4) { System.out.println("JavaHBaseBulkPutExample " + "{host} {port} {tableName}"); return; } String host = args[0]; String port = args[1]; String tableName = args[2]; SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseStreamingBulkPutExample " + tableName + ":" + port + ":" + tableName); JavaSparkContext jsc = new JavaSparkContext(sparkConf); try { JavaStreamingContext jssc = new JavaStreamingContext(jsc, new Duration(1000)); JavaReceiverInputDStream javaDstream = jssc.socketTextStream(host, Integer.parseInt(port)); Configuration conf = HBaseConfiguration.create(); JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf); hbaseContext.streamBulkPut(javaDstream, TableName.valueOf(tableName), new PutFunction()); } finally { jsc.stop(); } } public static class PutFunction implements Function { private static final long serialVersionUID = 1L; public Put call(String v) throws Exception { String[] part = v.split(","); Put put = new Put(Bytes.toBytes(part[0])); put.addColumn(Bytes.toBytes(part[1]), Bytes.toBytes(part[2]), Bytes.toBytes(part[3])); return put; } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/java/org/apache/hadoop/hbase/spark/protobuf/generated/FilterProtos.java ================================================ // Generated by the protocol buffer compiler. DO NOT EDIT! 
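// Editorial sketch: the Filter.proto source referenced on the next line is not included in this
// extract; inferred from the generated accessors below, it defines two messages along these lines:
//
//   package hbase.pb;
//   message SQLPredicatePushDownCellToColumnMapping {
//     required bytes  column_family = 1;
//     required bytes  qualifier     = 2;
//     required string column_name   = 3;
//   }
//   message SQLPredicatePushDownFilter {
//     required string dynamic_logic_expression = 1;
//     repeated bytes  value_from_query_array   = 2;
//     repeated SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3;
//   }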
// source: Filter.proto package org.apache.hadoop.hbase.spark.protobuf.generated; public final class FilterProtos { private FilterProtos() {} public static void registerAllExtensions( com.google.protobuf.ExtensionRegistry registry) { } public interface SQLPredicatePushDownCellToColumnMappingOrBuilder extends com.google.protobuf.MessageOrBuilder { // required bytes column_family = 1; /** * required bytes column_family = 1; */ boolean hasColumnFamily(); /** * required bytes column_family = 1; */ com.google.protobuf.ByteString getColumnFamily(); // required bytes qualifier = 2; /** * required bytes qualifier = 2; */ boolean hasQualifier(); /** * required bytes qualifier = 2; */ com.google.protobuf.ByteString getQualifier(); // required string column_name = 3; /** * required string column_name = 3; */ boolean hasColumnName(); /** * required string column_name = 3; */ java.lang.String getColumnName(); /** * required string column_name = 3; */ com.google.protobuf.ByteString getColumnNameBytes(); } /** * Protobuf type {@code hbase.pb.SQLPredicatePushDownCellToColumnMapping} */ public static final class SQLPredicatePushDownCellToColumnMapping extends com.google.protobuf.GeneratedMessage implements SQLPredicatePushDownCellToColumnMappingOrBuilder { // Use SQLPredicatePushDownCellToColumnMapping.newBuilder() to construct. private SQLPredicatePushDownCellToColumnMapping(com.google.protobuf.GeneratedMessage.Builder builder) { super(builder); this.unknownFields = builder.getUnknownFields(); } private SQLPredicatePushDownCellToColumnMapping(boolean noInit) { this.unknownFields = com.google.protobuf.UnknownFieldSet.getDefaultInstance(); } private static final SQLPredicatePushDownCellToColumnMapping defaultInstance; public static SQLPredicatePushDownCellToColumnMapping getDefaultInstance() { return defaultInstance; } public SQLPredicatePushDownCellToColumnMapping getDefaultInstanceForType() { return defaultInstance; } private final com.google.protobuf.UnknownFieldSet unknownFields; @java.lang.Override public final com.google.protobuf.UnknownFieldSet getUnknownFields() { return this.unknownFields; } private SQLPredicatePushDownCellToColumnMapping( com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException { initFields(); int mutable_bitField0_ = 0; com.google.protobuf.UnknownFieldSet.Builder unknownFields = com.google.protobuf.UnknownFieldSet.newBuilder(); try { boolean done = false; while (!done) { int tag = input.readTag(); switch (tag) { case 0: done = true; break; default: { if (!parseUnknownField(input, unknownFields, extensionRegistry, tag)) { done = true; } break; } case 10: { bitField0_ |= 0x00000001; columnFamily_ = input.readBytes(); break; } case 18: { bitField0_ |= 0x00000002; qualifier_ = input.readBytes(); break; } case 26: { bitField0_ |= 0x00000004; columnName_ = input.readBytes(); break; } } } } catch (com.google.protobuf.InvalidProtocolBufferException e) { throw e.setUnfinishedMessage(this); } catch (java.io.IOException e) { throw new com.google.protobuf.InvalidProtocolBufferException( e.getMessage()).setUnfinishedMessage(this); } finally { this.unknownFields = unknownFields.build(); makeExtensionsImmutable(); } } public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() { return org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.internal_static_hbase_pb_SQLPredicatePushDownCellToColumnMapping_descriptor; } protected 
com.google.protobuf.GeneratedMessage.FieldAccessorTable internalGetFieldAccessorTable() { return org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.internal_static_hbase_pb_SQLPredicatePushDownCellToColumnMapping_fieldAccessorTable .ensureFieldAccessorsInitialized( org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.class, org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.Builder.class); } public static com.google.protobuf.Parser PARSER = new com.google.protobuf.AbstractParser() { public SQLPredicatePushDownCellToColumnMapping parsePartialFrom( com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException { return new SQLPredicatePushDownCellToColumnMapping(input, extensionRegistry); } }; @java.lang.Override public com.google.protobuf.Parser getParserForType() { return PARSER; } private int bitField0_; // required bytes column_family = 1; public static final int COLUMN_FAMILY_FIELD_NUMBER = 1; private com.google.protobuf.ByteString columnFamily_; /** * required bytes column_family = 1; */ public boolean hasColumnFamily() { return ((bitField0_ & 0x00000001) == 0x00000001); } /** * required bytes column_family = 1; */ public com.google.protobuf.ByteString getColumnFamily() { return columnFamily_; } // required bytes qualifier = 2; public static final int QUALIFIER_FIELD_NUMBER = 2; private com.google.protobuf.ByteString qualifier_; /** * required bytes qualifier = 2; */ public boolean hasQualifier() { return ((bitField0_ & 0x00000002) == 0x00000002); } /** * required bytes qualifier = 2; */ public com.google.protobuf.ByteString getQualifier() { return qualifier_; } // required string column_name = 3; public static final int COLUMN_NAME_FIELD_NUMBER = 3; private java.lang.Object columnName_; /** * required string column_name = 3; */ public boolean hasColumnName() { return ((bitField0_ & 0x00000004) == 0x00000004); } /** * required string column_name = 3; */ public java.lang.String getColumnName() { java.lang.Object ref = columnName_; if (ref instanceof java.lang.String) { return (java.lang.String) ref; } else { com.google.protobuf.ByteString bs = (com.google.protobuf.ByteString) ref; java.lang.String s = bs.toStringUtf8(); if (bs.isValidUtf8()) { columnName_ = s; } return s; } } /** * required string column_name = 3; */ public com.google.protobuf.ByteString getColumnNameBytes() { java.lang.Object ref = columnName_; if (ref instanceof java.lang.String) { com.google.protobuf.ByteString b = com.google.protobuf.ByteString.copyFromUtf8( (java.lang.String) ref); columnName_ = b; return b; } else { return (com.google.protobuf.ByteString) ref; } } private void initFields() { columnFamily_ = com.google.protobuf.ByteString.EMPTY; qualifier_ = com.google.protobuf.ByteString.EMPTY; columnName_ = ""; } private byte memoizedIsInitialized = -1; public final boolean isInitialized() { byte isInitialized = memoizedIsInitialized; if (isInitialized != -1) return isInitialized == 1; if (!hasColumnFamily()) { memoizedIsInitialized = 0; return false; } if (!hasQualifier()) { memoizedIsInitialized = 0; return false; } if (!hasColumnName()) { memoizedIsInitialized = 0; return false; } memoizedIsInitialized = 1; return true; } public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException { getSerializedSize(); if (((bitField0_ & 0x00000001) == 0x00000001)) { 
output.writeBytes(1, columnFamily_); } if (((bitField0_ & 0x00000002) == 0x00000002)) { output.writeBytes(2, qualifier_); } if (((bitField0_ & 0x00000004) == 0x00000004)) { output.writeBytes(3, getColumnNameBytes()); } getUnknownFields().writeTo(output); } private int memoizedSerializedSize = -1; public int getSerializedSize() { int size = memoizedSerializedSize; if (size != -1) return size; size = 0; if (((bitField0_ & 0x00000001) == 0x00000001)) { size += com.google.protobuf.CodedOutputStream .computeBytesSize(1, columnFamily_); } if (((bitField0_ & 0x00000002) == 0x00000002)) { size += com.google.protobuf.CodedOutputStream .computeBytesSize(2, qualifier_); } if (((bitField0_ & 0x00000004) == 0x00000004)) { size += com.google.protobuf.CodedOutputStream .computeBytesSize(3, getColumnNameBytes()); } size += getUnknownFields().getSerializedSize(); memoizedSerializedSize = size; return size; } private static final long serialVersionUID = 0L; @java.lang.Override protected java.lang.Object writeReplace() throws java.io.ObjectStreamException { return super.writeReplace(); } @java.lang.Override public boolean equals(final java.lang.Object obj) { if (obj == this) { return true; } if (!(obj instanceof org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping)) { return super.equals(obj); } org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping other = (org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping) obj; boolean result = true; result = result && (hasColumnFamily() == other.hasColumnFamily()); if (hasColumnFamily()) { result = result && getColumnFamily() .equals(other.getColumnFamily()); } result = result && (hasQualifier() == other.hasQualifier()); if (hasQualifier()) { result = result && getQualifier() .equals(other.getQualifier()); } result = result && (hasColumnName() == other.hasColumnName()); if (hasColumnName()) { result = result && getColumnName() .equals(other.getColumnName()); } result = result && getUnknownFields().equals(other.getUnknownFields()); return result; } private int memoizedHashCode = 0; @java.lang.Override public int hashCode() { if (memoizedHashCode != 0) { return memoizedHashCode; } int hash = 41; hash = (19 * hash) + getDescriptorForType().hashCode(); if (hasColumnFamily()) { hash = (37 * hash) + COLUMN_FAMILY_FIELD_NUMBER; hash = (53 * hash) + getColumnFamily().hashCode(); } if (hasQualifier()) { hash = (37 * hash) + QUALIFIER_FIELD_NUMBER; hash = (53 * hash) + getQualifier().hashCode(); } if (hasColumnName()) { hash = (37 * hash) + COLUMN_NAME_FIELD_NUMBER; hash = (53 * hash) + getColumnName().hashCode(); } hash = (29 * hash) + getUnknownFields().hashCode(); memoizedHashCode = hash; return hash; } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping parseFrom( com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException { return PARSER.parseFrom(data); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping parseFrom( com.google.protobuf.ByteString data, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException { return PARSER.parseFrom(data, extensionRegistry); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping parseFrom(byte[] data) throws 
com.google.protobuf.InvalidProtocolBufferException { return PARSER.parseFrom(data); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping parseFrom( byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException { return PARSER.parseFrom(data, extensionRegistry); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping parseFrom(java.io.InputStream input) throws java.io.IOException { return PARSER.parseFrom(input); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping parseFrom( java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException { return PARSER.parseFrom(input, extensionRegistry); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping parseDelimitedFrom(java.io.InputStream input) throws java.io.IOException { return PARSER.parseDelimitedFrom(input); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping parseDelimitedFrom( java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException { return PARSER.parseDelimitedFrom(input, extensionRegistry); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping parseFrom( com.google.protobuf.CodedInputStream input) throws java.io.IOException { return PARSER.parseFrom(input); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping parseFrom( com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException { return PARSER.parseFrom(input, extensionRegistry); } public static Builder newBuilder() { return Builder.create(); } public Builder newBuilderForType() { return newBuilder(); } public static Builder newBuilder(org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping prototype) { return newBuilder().mergeFrom(prototype); } public Builder toBuilder() { return newBuilder(this); } @java.lang.Override protected Builder newBuilderForType( com.google.protobuf.GeneratedMessage.BuilderParent parent) { Builder builder = new Builder(parent); return builder; } /** * Protobuf type {@code hbase.pb.SQLPredicatePushDownCellToColumnMapping} */ public static final class Builder extends com.google.protobuf.GeneratedMessage.Builder implements org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMappingOrBuilder { public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() { return org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.internal_static_hbase_pb_SQLPredicatePushDownCellToColumnMapping_descriptor; } protected com.google.protobuf.GeneratedMessage.FieldAccessorTable internalGetFieldAccessorTable() { return org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.internal_static_hbase_pb_SQLPredicatePushDownCellToColumnMapping_fieldAccessorTable .ensureFieldAccessorsInitialized( org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.class, 
org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.Builder.class); } // Construct using org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.newBuilder() private Builder() { maybeForceBuilderInitialization(); } private Builder( com.google.protobuf.GeneratedMessage.BuilderParent parent) { super(parent); maybeForceBuilderInitialization(); } private void maybeForceBuilderInitialization() { if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) { } } private static Builder create() { return new Builder(); } public Builder clear() { super.clear(); columnFamily_ = com.google.protobuf.ByteString.EMPTY; bitField0_ = (bitField0_ & ~0x00000001); qualifier_ = com.google.protobuf.ByteString.EMPTY; bitField0_ = (bitField0_ & ~0x00000002); columnName_ = ""; bitField0_ = (bitField0_ & ~0x00000004); return this; } public Builder clone() { return create().mergeFrom(buildPartial()); } public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() { return org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.internal_static_hbase_pb_SQLPredicatePushDownCellToColumnMapping_descriptor; } public org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping getDefaultInstanceForType() { return org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.getDefaultInstance(); } public org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping build() { org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping result = buildPartial(); if (!result.isInitialized()) { throw newUninitializedMessageException(result); } return result; } public org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping buildPartial() { org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping result = new org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping(this); int from_bitField0_ = bitField0_; int to_bitField0_ = 0; if (((from_bitField0_ & 0x00000001) == 0x00000001)) { to_bitField0_ |= 0x00000001; } result.columnFamily_ = columnFamily_; if (((from_bitField0_ & 0x00000002) == 0x00000002)) { to_bitField0_ |= 0x00000002; } result.qualifier_ = qualifier_; if (((from_bitField0_ & 0x00000004) == 0x00000004)) { to_bitField0_ |= 0x00000004; } result.columnName_ = columnName_; result.bitField0_ = to_bitField0_; onBuilt(); return result; } public Builder mergeFrom(com.google.protobuf.Message other) { if (other instanceof org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping) { return mergeFrom((org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping)other); } else { super.mergeFrom(other); return this; } } public Builder mergeFrom(org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping other) { if (other == org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.getDefaultInstance()) return this; if (other.hasColumnFamily()) { setColumnFamily(other.getColumnFamily()); } if (other.hasQualifier()) { setQualifier(other.getQualifier()); } if (other.hasColumnName()) { bitField0_ |= 0x00000004; columnName_ = other.columnName_; onChanged(); } 
this.mergeUnknownFields(other.getUnknownFields()); return this; } public final boolean isInitialized() { if (!hasColumnFamily()) { return false; } if (!hasQualifier()) { return false; } if (!hasColumnName()) { return false; } return true; } public Builder mergeFrom( com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException { org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping parsedMessage = null; try { parsedMessage = PARSER.parsePartialFrom(input, extensionRegistry); } catch (com.google.protobuf.InvalidProtocolBufferException e) { parsedMessage = (org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping) e.getUnfinishedMessage(); throw e; } finally { if (parsedMessage != null) { mergeFrom(parsedMessage); } } return this; } private int bitField0_; // required bytes column_family = 1; private com.google.protobuf.ByteString columnFamily_ = com.google.protobuf.ByteString.EMPTY; /** * required bytes column_family = 1; */ public boolean hasColumnFamily() { return ((bitField0_ & 0x00000001) == 0x00000001); } /** * required bytes column_family = 1; */ public com.google.protobuf.ByteString getColumnFamily() { return columnFamily_; } /** * required bytes column_family = 1; */ public Builder setColumnFamily(com.google.protobuf.ByteString value) { if (value == null) { throw new NullPointerException(); } bitField0_ |= 0x00000001; columnFamily_ = value; onChanged(); return this; } /** * required bytes column_family = 1; */ public Builder clearColumnFamily() { bitField0_ = (bitField0_ & ~0x00000001); columnFamily_ = getDefaultInstance().getColumnFamily(); onChanged(); return this; } // required bytes qualifier = 2; private com.google.protobuf.ByteString qualifier_ = com.google.protobuf.ByteString.EMPTY; /** * required bytes qualifier = 2; */ public boolean hasQualifier() { return ((bitField0_ & 0x00000002) == 0x00000002); } /** * required bytes qualifier = 2; */ public com.google.protobuf.ByteString getQualifier() { return qualifier_; } /** * required bytes qualifier = 2; */ public Builder setQualifier(com.google.protobuf.ByteString value) { if (value == null) { throw new NullPointerException(); } bitField0_ |= 0x00000002; qualifier_ = value; onChanged(); return this; } /** * required bytes qualifier = 2; */ public Builder clearQualifier() { bitField0_ = (bitField0_ & ~0x00000002); qualifier_ = getDefaultInstance().getQualifier(); onChanged(); return this; } // required string column_name = 3; private java.lang.Object columnName_ = ""; /** * required string column_name = 3; */ public boolean hasColumnName() { return ((bitField0_ & 0x00000004) == 0x00000004); } /** * required string column_name = 3; */ public java.lang.String getColumnName() { java.lang.Object ref = columnName_; if (!(ref instanceof java.lang.String)) { java.lang.String s = ((com.google.protobuf.ByteString) ref) .toStringUtf8(); columnName_ = s; return s; } else { return (java.lang.String) ref; } } /** * required string column_name = 3; */ public com.google.protobuf.ByteString getColumnNameBytes() { java.lang.Object ref = columnName_; if (ref instanceof String) { com.google.protobuf.ByteString b = com.google.protobuf.ByteString.copyFromUtf8( (java.lang.String) ref); columnName_ = b; return b; } else { return (com.google.protobuf.ByteString) ref; } } /** * required string column_name = 3; */ public Builder setColumnName( java.lang.String value) { if (value == null) { 
throw new NullPointerException(); } bitField0_ |= 0x00000004; columnName_ = value; onChanged(); return this; } /** * required string column_name = 3; */ public Builder clearColumnName() { bitField0_ = (bitField0_ & ~0x00000004); columnName_ = getDefaultInstance().getColumnName(); onChanged(); return this; } /** * required string column_name = 3; */ public Builder setColumnNameBytes( com.google.protobuf.ByteString value) { if (value == null) { throw new NullPointerException(); } bitField0_ |= 0x00000004; columnName_ = value; onChanged(); return this; } // @@protoc_insertion_point(builder_scope:hbase.pb.SQLPredicatePushDownCellToColumnMapping) } static { defaultInstance = new SQLPredicatePushDownCellToColumnMapping(true); defaultInstance.initFields(); } // @@protoc_insertion_point(class_scope:hbase.pb.SQLPredicatePushDownCellToColumnMapping) } public interface SQLPredicatePushDownFilterOrBuilder extends com.google.protobuf.MessageOrBuilder { // required string dynamic_logic_expression = 1; /** * required string dynamic_logic_expression = 1; */ boolean hasDynamicLogicExpression(); /** * required string dynamic_logic_expression = 1; */ java.lang.String getDynamicLogicExpression(); /** * required string dynamic_logic_expression = 1; */ com.google.protobuf.ByteString getDynamicLogicExpressionBytes(); // repeated bytes value_from_query_array = 2; /** * repeated bytes value_from_query_array = 2; */ java.util.List getValueFromQueryArrayList(); /** * repeated bytes value_from_query_array = 2; */ int getValueFromQueryArrayCount(); /** * repeated bytes value_from_query_array = 2; */ com.google.protobuf.ByteString getValueFromQueryArray(int index); // repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ java.util.List getCellToColumnMappingList(); /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping getCellToColumnMapping(int index); /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ int getCellToColumnMappingCount(); /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ java.util.List getCellToColumnMappingOrBuilderList(); /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMappingOrBuilder getCellToColumnMappingOrBuilder( int index); } /** * Protobuf type {@code hbase.pb.SQLPredicatePushDownFilter} */ public static final class SQLPredicatePushDownFilter extends com.google.protobuf.GeneratedMessage implements SQLPredicatePushDownFilterOrBuilder { // Use SQLPredicatePushDownFilter.newBuilder() to construct. 
private SQLPredicatePushDownFilter(com.google.protobuf.GeneratedMessage.Builder builder) { super(builder); this.unknownFields = builder.getUnknownFields(); } private SQLPredicatePushDownFilter(boolean noInit) { this.unknownFields = com.google.protobuf.UnknownFieldSet.getDefaultInstance(); } private static final SQLPredicatePushDownFilter defaultInstance; public static SQLPredicatePushDownFilter getDefaultInstance() { return defaultInstance; } public SQLPredicatePushDownFilter getDefaultInstanceForType() { return defaultInstance; } private final com.google.protobuf.UnknownFieldSet unknownFields; @java.lang.Override public final com.google.protobuf.UnknownFieldSet getUnknownFields() { return this.unknownFields; } private SQLPredicatePushDownFilter( com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException { initFields(); int mutable_bitField0_ = 0; com.google.protobuf.UnknownFieldSet.Builder unknownFields = com.google.protobuf.UnknownFieldSet.newBuilder(); try { boolean done = false; while (!done) { int tag = input.readTag(); switch (tag) { case 0: done = true; break; default: { if (!parseUnknownField(input, unknownFields, extensionRegistry, tag)) { done = true; } break; } case 10: { bitField0_ |= 0x00000001; dynamicLogicExpression_ = input.readBytes(); break; } case 18: { if (!((mutable_bitField0_ & 0x00000002) == 0x00000002)) { valueFromQueryArray_ = new java.util.ArrayList(); mutable_bitField0_ |= 0x00000002; } valueFromQueryArray_.add(input.readBytes()); break; } case 26: { if (!((mutable_bitField0_ & 0x00000004) == 0x00000004)) { cellToColumnMapping_ = new java.util.ArrayList(); mutable_bitField0_ |= 0x00000004; } cellToColumnMapping_.add(input.readMessage(org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.PARSER, extensionRegistry)); break; } } } } catch (com.google.protobuf.InvalidProtocolBufferException e) { throw e.setUnfinishedMessage(this); } catch (java.io.IOException e) { throw new com.google.protobuf.InvalidProtocolBufferException( e.getMessage()).setUnfinishedMessage(this); } finally { if (((mutable_bitField0_ & 0x00000002) == 0x00000002)) { valueFromQueryArray_ = java.util.Collections.unmodifiableList(valueFromQueryArray_); } if (((mutable_bitField0_ & 0x00000004) == 0x00000004)) { cellToColumnMapping_ = java.util.Collections.unmodifiableList(cellToColumnMapping_); } this.unknownFields = unknownFields.build(); makeExtensionsImmutable(); } } public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() { return org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.internal_static_hbase_pb_SQLPredicatePushDownFilter_descriptor; } protected com.google.protobuf.GeneratedMessage.FieldAccessorTable internalGetFieldAccessorTable() { return org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.internal_static_hbase_pb_SQLPredicatePushDownFilter_fieldAccessorTable .ensureFieldAccessorsInitialized( org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter.class, org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter.Builder.class); } public static com.google.protobuf.Parser PARSER = new com.google.protobuf.AbstractParser() { public SQLPredicatePushDownFilter parsePartialFrom( com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException 
{ return new SQLPredicatePushDownFilter(input, extensionRegistry); } }; @java.lang.Override public com.google.protobuf.Parser getParserForType() { return PARSER; } private int bitField0_; // required string dynamic_logic_expression = 1; public static final int DYNAMIC_LOGIC_EXPRESSION_FIELD_NUMBER = 1; private java.lang.Object dynamicLogicExpression_; /** * required string dynamic_logic_expression = 1; */ public boolean hasDynamicLogicExpression() { return ((bitField0_ & 0x00000001) == 0x00000001); } /** * required string dynamic_logic_expression = 1; */ public java.lang.String getDynamicLogicExpression() { java.lang.Object ref = dynamicLogicExpression_; if (ref instanceof java.lang.String) { return (java.lang.String) ref; } else { com.google.protobuf.ByteString bs = (com.google.protobuf.ByteString) ref; java.lang.String s = bs.toStringUtf8(); if (bs.isValidUtf8()) { dynamicLogicExpression_ = s; } return s; } } /** * required string dynamic_logic_expression = 1; */ public com.google.protobuf.ByteString getDynamicLogicExpressionBytes() { java.lang.Object ref = dynamicLogicExpression_; if (ref instanceof java.lang.String) { com.google.protobuf.ByteString b = com.google.protobuf.ByteString.copyFromUtf8( (java.lang.String) ref); dynamicLogicExpression_ = b; return b; } else { return (com.google.protobuf.ByteString) ref; } } // repeated bytes value_from_query_array = 2; public static final int VALUE_FROM_QUERY_ARRAY_FIELD_NUMBER = 2; private java.util.List valueFromQueryArray_; /** * repeated bytes value_from_query_array = 2; */ public java.util.List getValueFromQueryArrayList() { return valueFromQueryArray_; } /** * repeated bytes value_from_query_array = 2; */ public int getValueFromQueryArrayCount() { return valueFromQueryArray_.size(); } /** * repeated bytes value_from_query_array = 2; */ public com.google.protobuf.ByteString getValueFromQueryArray(int index) { return valueFromQueryArray_.get(index); } // repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; public static final int CELL_TO_COLUMN_MAPPING_FIELD_NUMBER = 3; private java.util.List cellToColumnMapping_; /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public java.util.List getCellToColumnMappingList() { return cellToColumnMapping_; } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public java.util.List getCellToColumnMappingOrBuilderList() { return cellToColumnMapping_; } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public int getCellToColumnMappingCount() { return cellToColumnMapping_.size(); } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping getCellToColumnMapping(int index) { return cellToColumnMapping_.get(index); } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMappingOrBuilder getCellToColumnMappingOrBuilder( int index) { return cellToColumnMapping_.get(index); } private void initFields() { dynamicLogicExpression_ = ""; valueFromQueryArray_ = java.util.Collections.emptyList(); cellToColumnMapping_ = java.util.Collections.emptyList(); } private byte memoizedIsInitialized = -1; public final boolean isInitialized() { byte isInitialized = 
memoizedIsInitialized; if (isInitialized != -1) return isInitialized == 1; if (!hasDynamicLogicExpression()) { memoizedIsInitialized = 0; return false; } for (int i = 0; i < getCellToColumnMappingCount(); i++) { if (!getCellToColumnMapping(i).isInitialized()) { memoizedIsInitialized = 0; return false; } } memoizedIsInitialized = 1; return true; } public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException { getSerializedSize(); if (((bitField0_ & 0x00000001) == 0x00000001)) { output.writeBytes(1, getDynamicLogicExpressionBytes()); } for (int i = 0; i < valueFromQueryArray_.size(); i++) { output.writeBytes(2, valueFromQueryArray_.get(i)); } for (int i = 0; i < cellToColumnMapping_.size(); i++) { output.writeMessage(3, cellToColumnMapping_.get(i)); } getUnknownFields().writeTo(output); } private int memoizedSerializedSize = -1; public int getSerializedSize() { int size = memoizedSerializedSize; if (size != -1) return size; size = 0; if (((bitField0_ & 0x00000001) == 0x00000001)) { size += com.google.protobuf.CodedOutputStream .computeBytesSize(1, getDynamicLogicExpressionBytes()); } { int dataSize = 0; for (int i = 0; i < valueFromQueryArray_.size(); i++) { dataSize += com.google.protobuf.CodedOutputStream .computeBytesSizeNoTag(valueFromQueryArray_.get(i)); } size += dataSize; size += 1 * getValueFromQueryArrayList().size(); } for (int i = 0; i < cellToColumnMapping_.size(); i++) { size += com.google.protobuf.CodedOutputStream .computeMessageSize(3, cellToColumnMapping_.get(i)); } size += getUnknownFields().getSerializedSize(); memoizedSerializedSize = size; return size; } private static final long serialVersionUID = 0L; @java.lang.Override protected java.lang.Object writeReplace() throws java.io.ObjectStreamException { return super.writeReplace(); } @java.lang.Override public boolean equals(final java.lang.Object obj) { if (obj == this) { return true; } if (!(obj instanceof org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter)) { return super.equals(obj); } org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter other = (org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter) obj; boolean result = true; result = result && (hasDynamicLogicExpression() == other.hasDynamicLogicExpression()); if (hasDynamicLogicExpression()) { result = result && getDynamicLogicExpression() .equals(other.getDynamicLogicExpression()); } result = result && getValueFromQueryArrayList() .equals(other.getValueFromQueryArrayList()); result = result && getCellToColumnMappingList() .equals(other.getCellToColumnMappingList()); result = result && getUnknownFields().equals(other.getUnknownFields()); return result; } private int memoizedHashCode = 0; @java.lang.Override public int hashCode() { if (memoizedHashCode != 0) { return memoizedHashCode; } int hash = 41; hash = (19 * hash) + getDescriptorForType().hashCode(); if (hasDynamicLogicExpression()) { hash = (37 * hash) + DYNAMIC_LOGIC_EXPRESSION_FIELD_NUMBER; hash = (53 * hash) + getDynamicLogicExpression().hashCode(); } if (getValueFromQueryArrayCount() > 0) { hash = (37 * hash) + VALUE_FROM_QUERY_ARRAY_FIELD_NUMBER; hash = (53 * hash) + getValueFromQueryArrayList().hashCode(); } if (getCellToColumnMappingCount() > 0) { hash = (37 * hash) + CELL_TO_COLUMN_MAPPING_FIELD_NUMBER; hash = (53 * hash) + getCellToColumnMappingList().hashCode(); } hash = (29 * hash) + getUnknownFields().hashCode(); memoizedHashCode = hash; return hash; } public 
static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter parseFrom( com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException { return PARSER.parseFrom(data); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter parseFrom( com.google.protobuf.ByteString data, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException { return PARSER.parseFrom(data, extensionRegistry); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException { return PARSER.parseFrom(data); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter parseFrom( byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws com.google.protobuf.InvalidProtocolBufferException { return PARSER.parseFrom(data, extensionRegistry); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter parseFrom(java.io.InputStream input) throws java.io.IOException { return PARSER.parseFrom(input); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter parseFrom( java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException { return PARSER.parseFrom(input, extensionRegistry); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter parseDelimitedFrom(java.io.InputStream input) throws java.io.IOException { return PARSER.parseDelimitedFrom(input); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter parseDelimitedFrom( java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException { return PARSER.parseDelimitedFrom(input, extensionRegistry); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter parseFrom( com.google.protobuf.CodedInputStream input) throws java.io.IOException { return PARSER.parseFrom(input); } public static org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter parseFrom( com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException { return PARSER.parseFrom(input, extensionRegistry); } public static Builder newBuilder() { return Builder.create(); } public Builder newBuilderForType() { return newBuilder(); } public static Builder newBuilder(org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter prototype) { return newBuilder().mergeFrom(prototype); } public Builder toBuilder() { return newBuilder(this); } @java.lang.Override protected Builder newBuilderForType( com.google.protobuf.GeneratedMessage.BuilderParent parent) { Builder builder = new Builder(parent); return builder; } /** * Protobuf type {@code hbase.pb.SQLPredicatePushDownFilter} */ public static final class Builder extends com.google.protobuf.GeneratedMessage.Builder implements org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilterOrBuilder { public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() { return 
org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.internal_static_hbase_pb_SQLPredicatePushDownFilter_descriptor; } protected com.google.protobuf.GeneratedMessage.FieldAccessorTable internalGetFieldAccessorTable() { return org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.internal_static_hbase_pb_SQLPredicatePushDownFilter_fieldAccessorTable .ensureFieldAccessorsInitialized( org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter.class, org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter.Builder.class); } // Construct using org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter.newBuilder() private Builder() { maybeForceBuilderInitialization(); } private Builder( com.google.protobuf.GeneratedMessage.BuilderParent parent) { super(parent); maybeForceBuilderInitialization(); } private void maybeForceBuilderInitialization() { if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) { getCellToColumnMappingFieldBuilder(); } } private static Builder create() { return new Builder(); } public Builder clear() { super.clear(); dynamicLogicExpression_ = ""; bitField0_ = (bitField0_ & ~0x00000001); valueFromQueryArray_ = java.util.Collections.emptyList(); bitField0_ = (bitField0_ & ~0x00000002); if (cellToColumnMappingBuilder_ == null) { cellToColumnMapping_ = java.util.Collections.emptyList(); bitField0_ = (bitField0_ & ~0x00000004); } else { cellToColumnMappingBuilder_.clear(); } return this; } public Builder clone() { return create().mergeFrom(buildPartial()); } public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() { return org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.internal_static_hbase_pb_SQLPredicatePushDownFilter_descriptor; } public org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter getDefaultInstanceForType() { return org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter.getDefaultInstance(); } public org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter build() { org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter result = buildPartial(); if (!result.isInitialized()) { throw newUninitializedMessageException(result); } return result; } public org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter buildPartial() { org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter result = new org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter(this); int from_bitField0_ = bitField0_; int to_bitField0_ = 0; if (((from_bitField0_ & 0x00000001) == 0x00000001)) { to_bitField0_ |= 0x00000001; } result.dynamicLogicExpression_ = dynamicLogicExpression_; if (((bitField0_ & 0x00000002) == 0x00000002)) { valueFromQueryArray_ = java.util.Collections.unmodifiableList(valueFromQueryArray_); bitField0_ = (bitField0_ & ~0x00000002); } result.valueFromQueryArray_ = valueFromQueryArray_; if (cellToColumnMappingBuilder_ == null) { if (((bitField0_ & 0x00000004) == 0x00000004)) { cellToColumnMapping_ = java.util.Collections.unmodifiableList(cellToColumnMapping_); bitField0_ = (bitField0_ & ~0x00000004); } result.cellToColumnMapping_ = cellToColumnMapping_; } else { result.cellToColumnMapping_ = cellToColumnMappingBuilder_.build(); } result.bitField0_ = to_bitField0_; onBuilt(); return result; } public 
Builder mergeFrom(com.google.protobuf.Message other) { if (other instanceof org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter) { return mergeFrom((org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter)other); } else { super.mergeFrom(other); return this; } } public Builder mergeFrom(org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter other) { if (other == org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter.getDefaultInstance()) return this; if (other.hasDynamicLogicExpression()) { bitField0_ |= 0x00000001; dynamicLogicExpression_ = other.dynamicLogicExpression_; onChanged(); } if (!other.valueFromQueryArray_.isEmpty()) { if (valueFromQueryArray_.isEmpty()) { valueFromQueryArray_ = other.valueFromQueryArray_; bitField0_ = (bitField0_ & ~0x00000002); } else { ensureValueFromQueryArrayIsMutable(); valueFromQueryArray_.addAll(other.valueFromQueryArray_); } onChanged(); } if (cellToColumnMappingBuilder_ == null) { if (!other.cellToColumnMapping_.isEmpty()) { if (cellToColumnMapping_.isEmpty()) { cellToColumnMapping_ = other.cellToColumnMapping_; bitField0_ = (bitField0_ & ~0x00000004); } else { ensureCellToColumnMappingIsMutable(); cellToColumnMapping_.addAll(other.cellToColumnMapping_); } onChanged(); } } else { if (!other.cellToColumnMapping_.isEmpty()) { if (cellToColumnMappingBuilder_.isEmpty()) { cellToColumnMappingBuilder_.dispose(); cellToColumnMappingBuilder_ = null; cellToColumnMapping_ = other.cellToColumnMapping_; bitField0_ = (bitField0_ & ~0x00000004); cellToColumnMappingBuilder_ = com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders ? getCellToColumnMappingFieldBuilder() : null; } else { cellToColumnMappingBuilder_.addAllMessages(other.cellToColumnMapping_); } } } this.mergeUnknownFields(other.getUnknownFields()); return this; } public final boolean isInitialized() { if (!hasDynamicLogicExpression()) { return false; } for (int i = 0; i < getCellToColumnMappingCount(); i++) { if (!getCellToColumnMapping(i).isInitialized()) { return false; } } return true; } public Builder mergeFrom( com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) throws java.io.IOException { org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter parsedMessage = null; try { parsedMessage = PARSER.parsePartialFrom(input, extensionRegistry); } catch (com.google.protobuf.InvalidProtocolBufferException e) { parsedMessage = (org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownFilter) e.getUnfinishedMessage(); throw e; } finally { if (parsedMessage != null) { mergeFrom(parsedMessage); } } return this; } private int bitField0_; // required string dynamic_logic_expression = 1; private java.lang.Object dynamicLogicExpression_ = ""; /** * required string dynamic_logic_expression = 1; */ public boolean hasDynamicLogicExpression() { return ((bitField0_ & 0x00000001) == 0x00000001); } /** * required string dynamic_logic_expression = 1; */ public java.lang.String getDynamicLogicExpression() { java.lang.Object ref = dynamicLogicExpression_; if (!(ref instanceof java.lang.String)) { java.lang.String s = ((com.google.protobuf.ByteString) ref) .toStringUtf8(); dynamicLogicExpression_ = s; return s; } else { return (java.lang.String) ref; } } /** * required string dynamic_logic_expression = 1; */ public com.google.protobuf.ByteString 
getDynamicLogicExpressionBytes() { java.lang.Object ref = dynamicLogicExpression_; if (ref instanceof String) { com.google.protobuf.ByteString b = com.google.protobuf.ByteString.copyFromUtf8( (java.lang.String) ref); dynamicLogicExpression_ = b; return b; } else { return (com.google.protobuf.ByteString) ref; } } /** * required string dynamic_logic_expression = 1; */ public Builder setDynamicLogicExpression( java.lang.String value) { if (value == null) { throw new NullPointerException(); } bitField0_ |= 0x00000001; dynamicLogicExpression_ = value; onChanged(); return this; } /** * required string dynamic_logic_expression = 1; */ public Builder clearDynamicLogicExpression() { bitField0_ = (bitField0_ & ~0x00000001); dynamicLogicExpression_ = getDefaultInstance().getDynamicLogicExpression(); onChanged(); return this; } /** * required string dynamic_logic_expression = 1; */ public Builder setDynamicLogicExpressionBytes( com.google.protobuf.ByteString value) { if (value == null) { throw new NullPointerException(); } bitField0_ |= 0x00000001; dynamicLogicExpression_ = value; onChanged(); return this; } // repeated bytes value_from_query_array = 2; private java.util.List valueFromQueryArray_ = java.util.Collections.emptyList(); private void ensureValueFromQueryArrayIsMutable() { if (!((bitField0_ & 0x00000002) == 0x00000002)) { valueFromQueryArray_ = new java.util.ArrayList(valueFromQueryArray_); bitField0_ |= 0x00000002; } } /** * repeated bytes value_from_query_array = 2; */ public java.util.List getValueFromQueryArrayList() { return java.util.Collections.unmodifiableList(valueFromQueryArray_); } /** * repeated bytes value_from_query_array = 2; */ public int getValueFromQueryArrayCount() { return valueFromQueryArray_.size(); } /** * repeated bytes value_from_query_array = 2; */ public com.google.protobuf.ByteString getValueFromQueryArray(int index) { return valueFromQueryArray_.get(index); } /** * repeated bytes value_from_query_array = 2; */ public Builder setValueFromQueryArray( int index, com.google.protobuf.ByteString value) { if (value == null) { throw new NullPointerException(); } ensureValueFromQueryArrayIsMutable(); valueFromQueryArray_.set(index, value); onChanged(); return this; } /** * repeated bytes value_from_query_array = 2; */ public Builder addValueFromQueryArray(com.google.protobuf.ByteString value) { if (value == null) { throw new NullPointerException(); } ensureValueFromQueryArrayIsMutable(); valueFromQueryArray_.add(value); onChanged(); return this; } /** * repeated bytes value_from_query_array = 2; */ public Builder addAllValueFromQueryArray( java.lang.Iterable values) { ensureValueFromQueryArrayIsMutable(); super.addAll(values, valueFromQueryArray_); onChanged(); return this; } /** * repeated bytes value_from_query_array = 2; */ public Builder clearValueFromQueryArray() { valueFromQueryArray_ = java.util.Collections.emptyList(); bitField0_ = (bitField0_ & ~0x00000002); onChanged(); return this; } // repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; private java.util.List cellToColumnMapping_ = java.util.Collections.emptyList(); private void ensureCellToColumnMappingIsMutable() { if (!((bitField0_ & 0x00000004) == 0x00000004)) { cellToColumnMapping_ = new java.util.ArrayList(cellToColumnMapping_); bitField0_ |= 0x00000004; } } private com.google.protobuf.RepeatedFieldBuilder< org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping, 
org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.Builder, org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMappingOrBuilder> cellToColumnMappingBuilder_; /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public java.util.List getCellToColumnMappingList() { if (cellToColumnMappingBuilder_ == null) { return java.util.Collections.unmodifiableList(cellToColumnMapping_); } else { return cellToColumnMappingBuilder_.getMessageList(); } } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public int getCellToColumnMappingCount() { if (cellToColumnMappingBuilder_ == null) { return cellToColumnMapping_.size(); } else { return cellToColumnMappingBuilder_.getCount(); } } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping getCellToColumnMapping(int index) { if (cellToColumnMappingBuilder_ == null) { return cellToColumnMapping_.get(index); } else { return cellToColumnMappingBuilder_.getMessage(index); } } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public Builder setCellToColumnMapping( int index, org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping value) { if (cellToColumnMappingBuilder_ == null) { if (value == null) { throw new NullPointerException(); } ensureCellToColumnMappingIsMutable(); cellToColumnMapping_.set(index, value); onChanged(); } else { cellToColumnMappingBuilder_.setMessage(index, value); } return this; } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public Builder setCellToColumnMapping( int index, org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.Builder builderForValue) { if (cellToColumnMappingBuilder_ == null) { ensureCellToColumnMappingIsMutable(); cellToColumnMapping_.set(index, builderForValue.build()); onChanged(); } else { cellToColumnMappingBuilder_.setMessage(index, builderForValue.build()); } return this; } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public Builder addCellToColumnMapping(org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping value) { if (cellToColumnMappingBuilder_ == null) { if (value == null) { throw new NullPointerException(); } ensureCellToColumnMappingIsMutable(); cellToColumnMapping_.add(value); onChanged(); } else { cellToColumnMappingBuilder_.addMessage(value); } return this; } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public Builder addCellToColumnMapping( int index, org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping value) { if (cellToColumnMappingBuilder_ == null) { if (value == null) { throw new NullPointerException(); } ensureCellToColumnMappingIsMutable(); cellToColumnMapping_.add(index, value); onChanged(); } else { cellToColumnMappingBuilder_.addMessage(index, value); } return this; } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public Builder addCellToColumnMapping( 
org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.Builder builderForValue) { if (cellToColumnMappingBuilder_ == null) { ensureCellToColumnMappingIsMutable(); cellToColumnMapping_.add(builderForValue.build()); onChanged(); } else { cellToColumnMappingBuilder_.addMessage(builderForValue.build()); } return this; } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public Builder addCellToColumnMapping( int index, org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.Builder builderForValue) { if (cellToColumnMappingBuilder_ == null) { ensureCellToColumnMappingIsMutable(); cellToColumnMapping_.add(index, builderForValue.build()); onChanged(); } else { cellToColumnMappingBuilder_.addMessage(index, builderForValue.build()); } return this; } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public Builder addAllCellToColumnMapping( java.lang.Iterable values) { if (cellToColumnMappingBuilder_ == null) { ensureCellToColumnMappingIsMutable(); super.addAll(values, cellToColumnMapping_); onChanged(); } else { cellToColumnMappingBuilder_.addAllMessages(values); } return this; } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public Builder clearCellToColumnMapping() { if (cellToColumnMappingBuilder_ == null) { cellToColumnMapping_ = java.util.Collections.emptyList(); bitField0_ = (bitField0_ & ~0x00000004); onChanged(); } else { cellToColumnMappingBuilder_.clear(); } return this; } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public Builder removeCellToColumnMapping(int index) { if (cellToColumnMappingBuilder_ == null) { ensureCellToColumnMappingIsMutable(); cellToColumnMapping_.remove(index); onChanged(); } else { cellToColumnMappingBuilder_.remove(index); } return this; } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.Builder getCellToColumnMappingBuilder( int index) { return getCellToColumnMappingFieldBuilder().getBuilder(index); } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMappingOrBuilder getCellToColumnMappingOrBuilder( int index) { if (cellToColumnMappingBuilder_ == null) { return cellToColumnMapping_.get(index); } else { return cellToColumnMappingBuilder_.getMessageOrBuilder(index); } } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public java.util.List getCellToColumnMappingOrBuilderList() { if (cellToColumnMappingBuilder_ != null) { return cellToColumnMappingBuilder_.getMessageOrBuilderList(); } else { return java.util.Collections.unmodifiableList(cellToColumnMapping_); } } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.Builder addCellToColumnMappingBuilder() { return getCellToColumnMappingFieldBuilder().addBuilder( org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.getDefaultInstance()); } /** * repeated 
.hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.Builder addCellToColumnMappingBuilder( int index) { return getCellToColumnMappingFieldBuilder().addBuilder( index, org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.getDefaultInstance()); } /** * repeated .hbase.pb.SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; */ public java.util.List getCellToColumnMappingBuilderList() { return getCellToColumnMappingFieldBuilder().getBuilderList(); } private com.google.protobuf.RepeatedFieldBuilder< org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping, org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.Builder, org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMappingOrBuilder> getCellToColumnMappingFieldBuilder() { if (cellToColumnMappingBuilder_ == null) { cellToColumnMappingBuilder_ = new com.google.protobuf.RepeatedFieldBuilder< org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping, org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMapping.Builder, org.apache.hadoop.hbase.spark.protobuf.generated.FilterProtos.SQLPredicatePushDownCellToColumnMappingOrBuilder>( cellToColumnMapping_, ((bitField0_ & 0x00000004) == 0x00000004), getParentForChildren(), isClean()); cellToColumnMapping_ = null; } return cellToColumnMappingBuilder_; } // @@protoc_insertion_point(builder_scope:hbase.pb.SQLPredicatePushDownFilter) } static { defaultInstance = new SQLPredicatePushDownFilter(true); defaultInstance.initFields(); } // @@protoc_insertion_point(class_scope:hbase.pb.SQLPredicatePushDownFilter) } private static com.google.protobuf.Descriptors.Descriptor internal_static_hbase_pb_SQLPredicatePushDownCellToColumnMapping_descriptor; private static com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_hbase_pb_SQLPredicatePushDownCellToColumnMapping_fieldAccessorTable; private static com.google.protobuf.Descriptors.Descriptor internal_static_hbase_pb_SQLPredicatePushDownFilter_descriptor; private static com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_hbase_pb_SQLPredicatePushDownFilter_fieldAccessorTable; public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() { return descriptor; } private static com.google.protobuf.Descriptors.FileDescriptor descriptor; static { java.lang.String[] descriptorData = { "\n\014Filter.proto\022\010hbase.pb\"h\n\'SQLPredicate" + "PushDownCellToColumnMapping\022\025\n\rcolumn_fa" + "mily\030\001 \002(\014\022\021\n\tqualifier\030\002 \002(\014\022\023\n\013column_" + "name\030\003 \002(\t\"\261\001\n\032SQLPredicatePushDownFilte" + "r\022 \n\030dynamic_logic_expression\030\001 \002(\t\022\036\n\026v" + "alue_from_query_array\030\002 \003(\014\022Q\n\026cell_to_c" + "olumn_mapping\030\003 \003(\01321.hbase.pb.SQLPredic" + "atePushDownCellToColumnMappingBH\n0org.ap" + "ache.hadoop.hbase.spark.protobuf.generat" + "edB\014FilterProtosH\001\210\001\001\240\001\001" }; com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() { public com.google.protobuf.ExtensionRegistry assignDescriptors( 
com.google.protobuf.Descriptors.FileDescriptor root) { descriptor = root; internal_static_hbase_pb_SQLPredicatePushDownCellToColumnMapping_descriptor = getDescriptor().getMessageTypes().get(0); internal_static_hbase_pb_SQLPredicatePushDownCellToColumnMapping_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable( internal_static_hbase_pb_SQLPredicatePushDownCellToColumnMapping_descriptor, new java.lang.String[] { "ColumnFamily", "Qualifier", "ColumnName", }); internal_static_hbase_pb_SQLPredicatePushDownFilter_descriptor = getDescriptor().getMessageTypes().get(1); internal_static_hbase_pb_SQLPredicatePushDownFilter_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable( internal_static_hbase_pb_SQLPredicatePushDownFilter_descriptor, new java.lang.String[] { "DynamicLogicExpression", "ValueFromQueryArray", "CellToColumnMapping", }); return null; } }; com.google.protobuf.Descriptors.FileDescriptor .internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[] { }, assigner); } // @@protoc_insertion_point(outer_class_scope) } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/protobuf/Filter.proto ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // This file contains protocol buffers that are used for filters package hbase.pb; option java_package = "org.apache.hadoop.hbase.spark.protobuf.generated"; option java_outer_classname = "FilterProtos"; option java_generic_services = true; option java_generate_equals_and_hash = true; option optimize_for = SPEED; message SQLPredicatePushDownCellToColumnMapping { required bytes column_family = 1; required bytes qualifier = 2; required string column_name = 3; } message SQLPredicatePushDownFilter { required string dynamic_logic_expression = 1; repeated bytes value_from_query_array = 2; repeated SQLPredicatePushDownCellToColumnMapping cell_to_column_mapping = 3; } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/BulkLoadPartitioner.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark import java.util import java.util.Comparator import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.Partitioner /** * A Partitioner implementation that will separate records to different * HBase Regions based on region splits * * @param startKeys The start keys for the given table */ class BulkLoadPartitioner(startKeys:Array[Array[Byte]]) extends Partitioner { override def numPartitions: Int = startKeys.length override def getPartition(key: Any): Int = { val comparator: Comparator[Array[Byte]] = new Comparator[Array[Byte]] { override def compare(o1: Array[Byte], o2: Array[Byte]): Int = { Bytes.compareTo(o1, o2) } } val rowKey:Array[Byte] = key match { case qualifier: KeyFamilyQualifier => qualifier.rowKey case wrapper: ByteArrayWrapper => wrapper.value case _ => key.asInstanceOf[Array[Byte]] } val partition = util.Arrays.binarySearch(startKeys, rowKey, comparator) if (partition < 0) partition * -1 + -2 else partition } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/ByteArrayComparable.scala ================================================ /* * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark import org.apache.hadoop.hbase.util.Bytes class ByteArrayComparable(val bytes:Array[Byte], val offset:Int = 0, var length:Int = -1) extends Comparable[ByteArrayComparable] { if (length == -1) { length = bytes.length } override def compareTo(o: ByteArrayComparable): Int = { Bytes.compareTo(bytes, offset, length, o.bytes, o.offset, o.length) } override def hashCode(): Int = { Bytes.hashCode(bytes, offset, length) } override def equals (obj: Any): Boolean = { obj match { case b: ByteArrayComparable => Bytes.equals(bytes, offset, length, b.bytes, b.offset, b.length) case _ => false } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/ByteArrayWrapper.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark import java.io.Serializable import org.apache.hadoop.hbase.util.Bytes /** * This is a wrapper over a byte array so it can work as * a key in a hashMap * * @param value The Byte Array value */ class ByteArrayWrapper (var value:Array[Byte]) extends Comparable[ByteArrayWrapper] with Serializable { override def compareTo(valueOther: ByteArrayWrapper): Int = { Bytes.compareTo(value,valueOther.value) } override def equals(o2: Any): Boolean = { o2 match { case wrapper: ByteArrayWrapper => Bytes.equals(value, wrapper.value) case _ => false } } override def hashCode():Int = { Bytes.hashCode(value) } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/ColumnFamilyQualifierMapKeyWrapper.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark import org.apache.hadoop.hbase.util.Bytes /** * A wrapper class that will allow both columnFamily and qualifier to * be the key of a hashMap. 
Also allow for finding the value in a hashMap * without cloning the HBase value from the HBase Cell object * @param columnFamily ColumnFamily byte array * @param columnFamilyOffSet Offset of columnFamily value in the array * @param columnFamilyLength Length of the columnFamily value in the columnFamily array * @param qualifier Qualifier byte array * @param qualifierOffSet Offset of qualifier value in the array * @param qualifierLength Length of the qualifier value within the array */ class ColumnFamilyQualifierMapKeyWrapper(val columnFamily:Array[Byte], val columnFamilyOffSet:Int, val columnFamilyLength:Int, val qualifier:Array[Byte], val qualifierOffSet:Int, val qualifierLength:Int) extends Serializable { override def equals(other:Any): Boolean = { val otherWrapper = other.asInstanceOf[ColumnFamilyQualifierMapKeyWrapper] Bytes.compareTo(columnFamily, columnFamilyOffSet, columnFamilyLength, otherWrapper.columnFamily, otherWrapper.columnFamilyOffSet, otherWrapper.columnFamilyLength) == 0 && Bytes.compareTo(qualifier, qualifierOffSet, qualifierLength, otherWrapper.qualifier, otherWrapper.qualifierOffSet, otherWrapper.qualifierLength) == 0 } override def hashCode():Int = { Bytes.hashCode(columnFamily, columnFamilyOffSet, columnFamilyLength) + Bytes.hashCode(qualifier, qualifierOffSet, qualifierLength) } def cloneColumnFamily():Array[Byte] = { val resultArray = new Array[Byte](columnFamilyLength) System.arraycopy(columnFamily, columnFamilyOffSet, resultArray, 0, columnFamilyLength) resultArray } def cloneQualifier():Array[Byte] = { val resultArray = new Array[Byte](qualifierLength) System.arraycopy(qualifier, qualifierOffSet, resultArray, 0, qualifierLength) resultArray } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/DefaultSource.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
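/*
 * A minimal usage sketch for the relation provider defined in this file (illustrative
 * only: the table name "t1", the column family "c" and the field names are hypothetical,
 * and a SQLContext is assumed to be in scope as `sqlContext`). The option keys are the
 * constants declared on DefaultSource below:
 *
 *   val df = sqlContext.read
 *     .format("org.apache.hadoop.hbase.spark")
 *     .option("hbase.table", "t1")
 *     .option("hbase.columns.mapping",
 *       "KEY_FIELD STRING :key, A_FIELD STRING c:a, B_FIELD STRING c:b")
 *     .load()
 *   df.filter("KEY_FIELD = 'row1'").show()  // such predicates can be pruned/pushed down by HBaseRelation.buildScan
 */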
package org.apache.hadoop.hbase.spark import java.util import java.util.concurrent.ConcurrentLinkedQueue import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.spark.datasources.HBaseSparkConf import org.apache.hadoop.hbase.spark.datasources.HBaseTableScanRDD import org.apache.hadoop.hbase.spark.datasources.SerializableConfiguration import org.apache.hadoop.hbase.types._ import org.apache.hadoop.hbase.util.{Bytes, PositionedByteRange, SimplePositionedMutableByteRange} import org.apache.hadoop.hbase.{HBaseConfiguration, TableName} import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import scala.collection.mutable /** * DefaultSource for integration with Spark's dataframe datasources. * This class will produce a relationProvider based on input given to it from Spark * * In all, this DefaultSource supports the following datasource functionality: * - Scan range pruning through filter push down logic based on rowKeys * - Filter push down logic on HBase Cells * - Qualifier filtering based on columns used in the SparkSQL statement * - Type conversions of basic SQL types. All conversions will be * through the HBase Bytes object commands. */ class DefaultSource extends RelationProvider with Logging { val TABLE_KEY:String = "hbase.table" val SCHEMA_COLUMNS_MAPPING_KEY:String = "hbase.columns.mapping" val HBASE_CONFIG_RESOURCES_LOCATIONS:String = "hbase.config.resources" val USE_HBASE_CONTEXT:String = "hbase.use.hbase.context" val PUSH_DOWN_COLUMN_FILTER:String = "hbase.push.down.column.filter" /** * Given input from SparkSQL, constructs a BaseRelation * @param sqlContext SparkSQL context * @param parameters Parameters given to us from SparkSQL * @return A BaseRelation Object */ override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val tableName = parameters.get(TABLE_KEY) if (tableName.isEmpty) throw new IllegalArgumentException("Invalid value for " + TABLE_KEY + " '" + tableName + "'") val schemaMappingString = parameters.getOrElse(SCHEMA_COLUMNS_MAPPING_KEY, "") val hbaseConfigResources = parameters.getOrElse(HBASE_CONFIG_RESOURCES_LOCATIONS, "") val useHBaseResources = parameters.getOrElse(USE_HBASE_CONTEXT, "true") val usePushDownColumnFilter = parameters.getOrElse(PUSH_DOWN_COLUMN_FILTER, "true") new HBaseRelation(tableName.get, generateSchemaMappingMap(schemaMappingString), hbaseConfigResources, useHBaseResources.equalsIgnoreCase("true"), usePushDownColumnFilter.equalsIgnoreCase("true"), parameters)(sqlContext) } /** * Reads the SCHEMA_COLUMNS_MAPPING_KEY and converts it to a map of * SchemaQualifierDefinitions with the original SQL column name as the key * @param schemaMappingString The schema mapping string from the SparkSQL map * @return A map of definitions keyed by the SparkSQL column name */ def generateSchemaMappingMap(schemaMappingString:String): java.util.HashMap[String, SchemaQualifierDefinition] = { try { val columnDefinitions = schemaMappingString.split(',') val resultingMap = new java.util.HashMap[String, SchemaQualifierDefinition]() columnDefinitions.map(cd => { val parts = cd.trim.split(' ') //Make sure we get three parts if (parts.length == 3) { val hbaseDefinitionParts = if (parts(2).charAt(0) == ':') { Array[String]("", "key") } else { parts(2).split(':') } resultingMap.put(parts(0), new SchemaQualifierDefinition(parts(0), parts(1), hbaseDefinitionParts(0), hbaseDefinitionParts(1))) } else { throw new IllegalArgumentException("Invalid value for schema mapping '" + cd + "' should be '<columnName> <columnType> <columnFamily>:<qualifier>' " + "for columns and '<columnName> <columnType> :<qualifier>' for rowKeys") } }) resultingMap } catch { case e:Exception => throw new IllegalArgumentException("Invalid value for " + SCHEMA_COLUMNS_MAPPING_KEY + " '" + schemaMappingString + "'", e ) } } }
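/*
 * A worked example of the mapping string parsed by generateSchemaMappingMap above
 * (the field names, the "c" column family and the "a"/"b" qualifiers are illustrative):
 *
 *   "KEY_FIELD STRING :key, A_FIELD STRING c:a, B_FIELD STRING c:b"
 *
 * yields one SchemaQualifierDefinition per column, keyed by the SparkSQL column name:
 *
 *   "KEY_FIELD" -> SchemaQualifierDefinition("KEY_FIELD", "STRING", "", "key")  // empty column family marks the row key
 *   "A_FIELD"   -> SchemaQualifierDefinition("A_FIELD", "STRING", "c", "a")
 *   "B_FIELD"   -> SchemaQualifierDefinition("B_FIELD", "STRING", "c", "b")
 *
 * Entries with an empty column family are treated as the row key by the filter
 * push-down logic in HBaseRelation below.
 */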
/** * Implementation of Spark BaseRelation that will build up our scan logic, * do the scan pruning, filter push down, and value conversions * * @param tableName HBase table that we plan to read from * @param schemaMappingDefinition SchemaMapping information to map HBase * Qualifiers to SparkSQL columns * @param configResources Optional comma separated list of config resources * to get based on their URI * @param useHBaseContext If true this will look to see if * HBaseContext.latest is populated to use that * connection information * @param sqlContext SparkSQL context */ case class HBaseRelation (val tableName:String, val schemaMappingDefinition: java.util.HashMap[String, SchemaQualifierDefinition], val configResources:String, val useHBaseContext:Boolean, val usePushDownColumnFilter:Boolean, @transient parameters: Map[String, String] ) ( @transient val sqlContext:SQLContext) extends BaseRelation with PrunedFilteredScan with Logging { // The user supplied per table parameter will overwrite global ones in SparkConf val blockCacheEnable = parameters.get(HBaseSparkConf.BLOCK_CACHE_ENABLE).map(_.toBoolean) .getOrElse( sqlContext.sparkContext.getConf.getBoolean( HBaseSparkConf.BLOCK_CACHE_ENABLE, HBaseSparkConf.defaultBlockCacheEnable)) val cacheSize = parameters.get(HBaseSparkConf.CACHE_SIZE).map(_.toInt) .getOrElse( sqlContext.sparkContext.getConf.getInt( HBaseSparkConf.CACHE_SIZE, HBaseSparkConf.defaultCachingSize)) val batchNum = parameters.get(HBaseSparkConf.BATCH_NUM).map(_.toInt) .getOrElse(sqlContext.sparkContext.getConf.getInt( HBaseSparkConf.BATCH_NUM, HBaseSparkConf.defaultBatchNum)) val bulkGetSize = parameters.get(HBaseSparkConf.BULKGET_SIZE).map(_.toInt) .getOrElse(sqlContext.sparkContext.getConf.getInt( HBaseSparkConf.BULKGET_SIZE, HBaseSparkConf.defaultBulkGetSize)) //create or get latest HBaseContext val hbaseContext:HBaseContext = if (useHBaseContext) { LatestHBaseContextCache.latest } else { val config = HBaseConfiguration.create() configResources.split(",").foreach( r => config.addResource(r)) new HBaseContext(sqlContext.sparkContext, config) } val wrappedConf = new SerializableConfiguration(hbaseContext.config) def hbaseConf = wrappedConf.value /** * Generates a Spark SQL schema object so Spark SQL knows what is being * provided by this BaseRelation * * @return schema generated from the SCHEMA_COLUMNS_MAPPING_KEY value */ override def schema: StructType = { val metadataBuilder = new MetadataBuilder() val structFieldArray = new Array[StructField](schemaMappingDefinition.size()) val schemaMappingDefinitionIt = schemaMappingDefinition.values().iterator() var indexCounter = 0 while (schemaMappingDefinitionIt.hasNext) { val c = schemaMappingDefinitionIt.next() val metadata = metadataBuilder.putString("name", c.columnName).build() val structField = new StructField(c.columnName, c.columnSparkSqlType, nullable = true, metadata) structFieldArray(indexCounter) = structField indexCounter += 1 } val result = new StructType(structFieldArray) result } /** * Here we are building the functionality to populate the resulting RDD[Row] * Here is where we will do the following: * - Filter push down * - Scan or GetList pruning
* - Executing our scan(s) or/and GetList to generate result * * @param requiredColumns The columns that are being requested by the requesting query * @param filters The filters that are being applied by the requesting query * @return RDD will all the results from HBase needed for SparkSQL to * execute the query on */ override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { val pushDownTuple = buildPushDownPredicatesResource(filters) val pushDownRowKeyFilter = pushDownTuple._1 var pushDownDynamicLogicExpression = pushDownTuple._2 val valueArray = pushDownTuple._3 if (!usePushDownColumnFilter) { pushDownDynamicLogicExpression = null } logDebug("pushDownRowKeyFilter: " + pushDownRowKeyFilter.ranges) if (pushDownDynamicLogicExpression != null) { logDebug("pushDownDynamicLogicExpression: " + pushDownDynamicLogicExpression.toExpressionString) } logDebug("valueArray: " + valueArray.length) val requiredQualifierDefinitionList = new mutable.MutableList[SchemaQualifierDefinition] requiredColumns.foreach( c => { val definition = schemaMappingDefinition.get(c) requiredQualifierDefinitionList += definition }) //Create a local variable so that scala doesn't have to // serialize the whole HBaseRelation Object val serializableDefinitionMap = schemaMappingDefinition //retain the information for unit testing checks DefaultSourceStaticUtils.populateLatestExecutionRules(pushDownRowKeyFilter, pushDownDynamicLogicExpression) val getList = new util.ArrayList[Get]() val rddList = new util.ArrayList[RDD[Row]]() //add points to getList pushDownRowKeyFilter.points.foreach(p => { val get = new Get(p) requiredQualifierDefinitionList.foreach( d => { if (d.columnFamilyBytes.length > 0) get.addColumn(d.columnFamilyBytes, d.qualifierBytes) }) getList.add(get) }) val pushDownFilterJava = if (usePushDownColumnFilter && pushDownDynamicLogicExpression != null) { Some(new SparkSQLPushDownFilter(pushDownDynamicLogicExpression, valueArray, requiredQualifierDefinitionList)) } else { None } val hRdd = new HBaseTableScanRDD(this, hbaseContext, pushDownFilterJava, requiredQualifierDefinitionList.seq) pushDownRowKeyFilter.points.foreach(hRdd.addPoint(_)) pushDownRowKeyFilter.ranges.foreach(hRdd.addRange(_)) var resultRDD: RDD[Row] = { val tmp = hRdd.map{ r => Row.fromSeq(requiredColumns.map(c => DefaultSourceStaticUtils.getValue(c, serializableDefinitionMap, r))) } if (tmp.partitions.size > 0) { tmp } else { null } } if (resultRDD == null) { val scan = new Scan() scan.setCacheBlocks(blockCacheEnable) scan.setBatch(batchNum) scan.setCaching(cacheSize) requiredQualifierDefinitionList.foreach( d => scan.addColumn(d.columnFamilyBytes, d.qualifierBytes)) val rdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan).map(r => { Row.fromSeq(requiredColumns.map(c => DefaultSourceStaticUtils.getValue(c, serializableDefinitionMap, r._2))) }) resultRDD=rdd } resultRDD } def buildPushDownPredicatesResource(filters: Array[Filter]): (RowKeyFilter, DynamicLogicExpression, Array[Array[Byte]]) = { var superRowKeyFilter:RowKeyFilter = null val queryValueList = new mutable.MutableList[Array[Byte]] var superDynamicLogicExpression: DynamicLogicExpression = null filters.foreach( f => { val rowKeyFilter = new RowKeyFilter() val logicExpression = transverseFilterTree(rowKeyFilter, queryValueList, f) if (superDynamicLogicExpression == null) { superDynamicLogicExpression = logicExpression superRowKeyFilter = rowKeyFilter } else { superDynamicLogicExpression = new AndLogicExpression(superDynamicLogicExpression, 
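// every additional top-level Spark filter is ANDed onto the accumulated expression,
// and its row key restriction is intersected into superRowKeyFilter just below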
logicExpression) superRowKeyFilter.mergeIntersect(rowKeyFilter) } }) val queryValueArray = queryValueList.toArray if (superRowKeyFilter == null) { superRowKeyFilter = new RowKeyFilter } (superRowKeyFilter, superDynamicLogicExpression, queryValueArray) } def transverseFilterTree(parentRowKeyFilter:RowKeyFilter, valueArray:mutable.MutableList[Array[Byte]], filter:Filter): DynamicLogicExpression = { filter match { case EqualTo(attr, value) => val columnDefinition = schemaMappingDefinition.get(attr) if (columnDefinition != null) { if (columnDefinition.columnFamily.isEmpty) { parentRowKeyFilter.mergeIntersect(new RowKeyFilter( DefaultSourceStaticUtils.getByteValue(attr, schemaMappingDefinition, value.toString), null)) } val byteValue = DefaultSourceStaticUtils.getByteValue(attr, schemaMappingDefinition, value.toString) valueArray += byteValue } new EqualLogicExpression(attr, valueArray.length - 1, false) case LessThan(attr, value) => val columnDefinition = schemaMappingDefinition.get(attr) if (columnDefinition != null) { if (columnDefinition.columnFamily.isEmpty) { parentRowKeyFilter.mergeIntersect(new RowKeyFilter(null, new ScanRange(DefaultSourceStaticUtils.getByteValue(attr, schemaMappingDefinition, value.toString), false, new Array[Byte](0), true))) } val byteValue = DefaultSourceStaticUtils.getByteValue(attr, schemaMappingDefinition, value.toString) valueArray += byteValue } new LessThanLogicExpression(attr, valueArray.length - 1) case GreaterThan(attr, value) => val columnDefinition = schemaMappingDefinition.get(attr) if (columnDefinition != null) { if (columnDefinition.columnFamily.isEmpty) { parentRowKeyFilter.mergeIntersect(new RowKeyFilter(null, new ScanRange(null, true, DefaultSourceStaticUtils.getByteValue(attr, schemaMappingDefinition, value.toString), false))) } val byteValue = DefaultSourceStaticUtils.getByteValue(attr, schemaMappingDefinition, value.toString) valueArray += byteValue } new GreaterThanLogicExpression(attr, valueArray.length - 1) case LessThanOrEqual(attr, value) => val columnDefinition = schemaMappingDefinition.get(attr) if (columnDefinition != null) { if (columnDefinition.columnFamily.isEmpty) { parentRowKeyFilter.mergeIntersect(new RowKeyFilter(null, new ScanRange(DefaultSourceStaticUtils.getByteValue(attr, schemaMappingDefinition, value.toString), true, new Array[Byte](0), true))) } val byteValue = DefaultSourceStaticUtils.getByteValue(attr, schemaMappingDefinition, value.toString) valueArray += byteValue } new LessThanOrEqualLogicExpression(attr, valueArray.length - 1) case GreaterThanOrEqual(attr, value) => val columnDefinition = schemaMappingDefinition.get(attr) if (columnDefinition != null) { if (columnDefinition.columnFamily.isEmpty) { parentRowKeyFilter.mergeIntersect(new RowKeyFilter(null, new ScanRange(null, true, DefaultSourceStaticUtils.getByteValue(attr, schemaMappingDefinition, value.toString), true))) } val byteValue = DefaultSourceStaticUtils.getByteValue(attr, schemaMappingDefinition, value.toString) valueArray += byteValue } new GreaterThanOrEqualLogicExpression(attr, valueArray.length - 1) case Or(left, right) => val leftExpression = transverseFilterTree(parentRowKeyFilter, valueArray, left) val rightSideRowKeyFilter = new RowKeyFilter val rightExpression = transverseFilterTree(rightSideRowKeyFilter, valueArray, right) parentRowKeyFilter.mergeUnion(rightSideRowKeyFilter) new OrLogicExpression(leftExpression, rightExpression) case And(left, right) => val leftExpression = transverseFilterTree(parentRowKeyFilter, valueArray, left) val 
rightSideRowKeyFilter = new RowKeyFilter val rightExpression = transverseFilterTree(rightSideRowKeyFilter, valueArray, right) parentRowKeyFilter.mergeIntersect(rightSideRowKeyFilter) new AndLogicExpression(leftExpression, rightExpression) case IsNull(attr) => new IsNullLogicExpression(attr, false) case IsNotNull(attr) => new IsNullLogicExpression(attr, true) case _ => new PassThroughLogicExpression } } } /** * Construct to contains column data that spend SparkSQL and HBase * * @param columnName SparkSQL column name * @param colType SparkSQL column type * @param columnFamily HBase column family * @param qualifier HBase qualifier name */ case class SchemaQualifierDefinition(columnName:String, colType:String, columnFamily:String, qualifier:String) extends Serializable { val columnFamilyBytes = Bytes.toBytes(columnFamily) val qualifierBytes = Bytes.toBytes(qualifier) val columnSparkSqlType:DataType = if (colType.equals("BOOLEAN")) BooleanType else if (colType.equals("TINYINT")) IntegerType else if (colType.equals("INT")) IntegerType else if (colType.equals("BIGINT")) LongType else if (colType.equals("FLOAT")) FloatType else if (colType.equals("DOUBLE")) DoubleType else if (colType.equals("STRING")) StringType else if (colType.equals("TIMESTAMP")) TimestampType else if (colType.equals("DECIMAL")) StringType else throw new IllegalArgumentException("Unsupported column type :" + colType) } /** * Construct to contain a single scan ranges information. Also * provide functions to merge with other scan ranges through AND * or OR operators * * @param upperBound Upper bound of scan * @param isUpperBoundEqualTo Include upper bound value in the results * @param lowerBound Lower bound of scan * @param isLowerBoundEqualTo Include lower bound value in the results */ class ScanRange(var upperBound:Array[Byte], var isUpperBoundEqualTo:Boolean, var lowerBound:Array[Byte], var isLowerBoundEqualTo:Boolean) extends Serializable { /** * Function to merge another scan object through a AND operation * @param other Other scan object */ def mergeIntersect(other:ScanRange): Unit = { val upperBoundCompare = compareRange(upperBound, other.upperBound) val lowerBoundCompare = compareRange(lowerBound, other.lowerBound) upperBound = if (upperBoundCompare <0) upperBound else other.upperBound lowerBound = if (lowerBoundCompare >0) lowerBound else other.lowerBound isLowerBoundEqualTo = if (lowerBoundCompare == 0) isLowerBoundEqualTo && other.isLowerBoundEqualTo else isLowerBoundEqualTo isUpperBoundEqualTo = if (upperBoundCompare == 0) isUpperBoundEqualTo && other.isUpperBoundEqualTo else isUpperBoundEqualTo } /** * Function to merge another scan object through a OR operation * @param other Other scan object */ def mergeUnion(other:ScanRange): Unit = { val upperBoundCompare = compareRange(upperBound, other.upperBound) val lowerBoundCompare = compareRange(lowerBound, other.lowerBound) upperBound = if (upperBoundCompare >0) upperBound else other.upperBound lowerBound = if (lowerBoundCompare <0) lowerBound else other.lowerBound isLowerBoundEqualTo = if (lowerBoundCompare == 0) isLowerBoundEqualTo || other.isLowerBoundEqualTo else if (lowerBoundCompare < 0) isLowerBoundEqualTo else other.isLowerBoundEqualTo isUpperBoundEqualTo = if (upperBoundCompare == 0) isUpperBoundEqualTo || other.isUpperBoundEqualTo else if (upperBoundCompare < 0) other.isUpperBoundEqualTo else isUpperBoundEqualTo } /** * Common function to see if this scan over laps with another * * Reference Visual * * A B * |---------------------------| * LL--------------LU * 
RL--------------RU * * A = lowest value is byte[0] * B = highest value is null * LL = Left Lower Bound * LU = Left Upper Bound * RL = Right Lower Bound * RU = Right Upper Bound * * @param other Other scan object * @return The overlapping ScanRange if the two ranges overlap, null otherwise */ def getOverLapScanRange(other:ScanRange): ScanRange = { var leftRange:ScanRange = null var rightRange:ScanRange = null //First identify the Left range // Also lower bound can't be null if (compareRange(lowerBound, other.lowerBound) < 0 || compareRange(upperBound, other.upperBound) < 0) { leftRange = this rightRange = other } else { leftRange = other rightRange = this } //Then see if leftRange goes to null or if leftRange.upperBound // upper is greater than or equal to rightRange.lowerBound if (leftRange.upperBound == null || Bytes.compareTo(leftRange.upperBound, rightRange.lowerBound) >= 0) { new ScanRange(leftRange.upperBound, leftRange.isUpperBoundEqualTo, rightRange.lowerBound, rightRange.isLowerBoundEqualTo) } else { null } } /** * Special compare logic because we can have null values * for left or right bound * * @param left Left byte array * @param right Right byte array * @return 0 if equal, 1 if left is greater and -1 if right is greater */ def compareRange(left:Array[Byte], right:Array[Byte]): Int = { if (left == null && right == null) 0 else if (left == null && right != null) 1 else if (left != null && right == null) -1 else Bytes.compareTo(left, right) } /** * Checks whether the given point falls within this scan range, honoring the * inclusive/exclusive bound flags * * @param point Point (row key bytes) to test * @return True if the point is inside the range, false otherwise */ def containsPoint(point:Array[Byte]): Boolean = { val lowerCompare = compareRange(point, lowerBound) val upperCompare = compareRange(point, upperBound) ((isLowerBoundEqualTo && lowerCompare >= 0) || (!isLowerBoundEqualTo && lowerCompare > 0)) && ((isUpperBoundEqualTo && upperCompare <= 0) || (!isUpperBoundEqualTo && upperCompare < 0)) } override def toString:String = { "ScanRange:(upperBound:" + Bytes.toString(upperBound) + ",isUpperBoundEqualTo:" + isUpperBoundEqualTo + ",lowerBound:" + Bytes.toString(lowerBound) + ",isLowerBoundEqualTo:" + isLowerBoundEqualTo + ")" } } /** * Contains information related to the filters for a given column. * This can contain many ranges or points.
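*
* A minimal usage sketch; the byte values and bounds below are illustrative only and not
* part of this connector:
*
* {{{
*   // point filter: passes only the exact value "a"
*   val pointFilter = new ColumnFilter(Bytes.toBytes("a"))
*
*   // range filter: values in ["a", "d")
*   val rangeFilter = new ColumnFilter(
*     currentRange = new ScanRange(Bytes.toBytes("d"), false, Bytes.toBytes("a"), true))
*
*   val b = Bytes.toBytes("b")
*   rangeFilter.validate(b, 0, b.length)   // "b" falls inside the range
*
*   // OR-combine: rangeFilter now also passes the exact point "a"
*   rangeFilter.mergeUnion(pointFilter)
* }}}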
* * @param currentPoint the initial point when the filter is created * @param currentRange the initial scanRange when the filter is created */ class ColumnFilter (currentPoint:Array[Byte] = null, currentRange:ScanRange = null, var points:mutable.MutableList[Array[Byte]] = new mutable.MutableList[Array[Byte]](), var ranges:mutable.MutableList[ScanRange] = new mutable.MutableList[ScanRange]() ) extends Serializable { //Collection of ranges if (currentRange != null ) ranges.+=(currentRange) //Collection of points if (currentPoint != null) points.+=(currentPoint) /** * This will validate a given value against the filter's points and/or ranges * and report whether the value passes the filter * * @param value Value to be validated * @param valueOffSet The offset of the value * @param valueLength The length of the value * @return True if the value passes the filter, false if not */ def validate(value:Array[Byte], valueOffSet:Int, valueLength:Int):Boolean = { var result = false points.foreach( p => { if (Bytes.equals(p, 0, p.length, value, valueOffSet, valueLength)) { result = true } }) ranges.foreach( r => { val upperBoundPass = r.upperBound == null || (r.isUpperBoundEqualTo && Bytes.compareTo(r.upperBound, 0, r.upperBound.length, value, valueOffSet, valueLength) >= 0) || (!r.isUpperBoundEqualTo && Bytes.compareTo(r.upperBound, 0, r.upperBound.length, value, valueOffSet, valueLength) > 0) val lowerBoundPass = r.lowerBound == null || r.lowerBound.length == 0 || (r.isLowerBoundEqualTo && Bytes.compareTo(r.lowerBound, 0, r.lowerBound.length, value, valueOffSet, valueLength) <= 0) || (!r.isLowerBoundEqualTo && Bytes.compareTo(r.lowerBound, 0, r.lowerBound.length, value, valueOffSet, valueLength) < 0) result = result || (upperBoundPass && lowerBoundPass) }) result } /** * This will allow us to merge filter logic that is joined to the existing filter * through an OR operator * * @param other Filter to merge */ def mergeUnion(other:ColumnFilter): Unit = { other.points.foreach( p => points += p) other.ranges.foreach( otherR => { var doesOverLap = false ranges.foreach{ r => if (r.getOverLapScanRange(otherR) != null) { r.mergeUnion(otherR) doesOverLap = true }} if (!doesOverLap) ranges.+=(otherR) }) } /** * This will allow us to merge filter logic that is joined to the existing filter * through an AND operator * * @param other Filter to merge */ def mergeIntersect(other:ColumnFilter): Unit = { val survivingPoints = new mutable.MutableList[Array[Byte]]() points.foreach( p => { other.points.foreach( otherP => { if (Bytes.equals(p, otherP)) { survivingPoints.+=(p) } }) }) points = survivingPoints val survivingRanges = new mutable.MutableList[ScanRange]() other.ranges.foreach( otherR => { ranges.foreach( r => { if (r.getOverLapScanRange(otherR) != null) { r.mergeIntersect(otherR) survivingRanges += r } }) }) ranges = survivingRanges } override def toString:String = { val strBuilder = new StringBuilder strBuilder.append("(points:(") var isFirst = true points.foreach( p => { if (isFirst) isFirst = false else strBuilder.append(",") strBuilder.append(Bytes.toString(p)) }) strBuilder.append("),ranges:") isFirst = true ranges.foreach( r => { if (isFirst) isFirst = false else strBuilder.append(",") strBuilder.append(r) }) strBuilder.append("))") strBuilder.toString() } } /** * A collection of ColumnFilters indexed by column names.
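*
* A short sketch of how per-column filters accumulate (column names are illustrative):
*
* {{{
*   val filters = new ColumnFilterCollection
*   filters.mergeUnion("col1", new ColumnFilter(Bytes.toBytes("a")))
*   filters.mergeUnion("col1", new ColumnFilter(Bytes.toBytes("b")))   // both points kept for "col1"
*
*   val others = new ColumnFilterCollection
*   others.mergeUnion("col2", new ColumnFilter(Bytes.toBytes("c")))
*   filters.mergeIntersect(others)   // "col2" is added; columns present in both are intersected
* }}}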
* * Also contains merge commends that will consolidate the filters * per column name */ class ColumnFilterCollection { val columnFilterMap = new mutable.HashMap[String, ColumnFilter] def clear(): Unit = { columnFilterMap.clear() } /** * This will allow us to merge filter logic that is joined to the existing filter * through a OR operator. This will merge a single columns filter * * @param column The column to be merged * @param other The other ColumnFilter object to merge */ def mergeUnion(column:String, other:ColumnFilter): Unit = { val existingFilter = columnFilterMap.get(column) if (existingFilter.isEmpty) { columnFilterMap.+=((column, other)) } else { existingFilter.get.mergeUnion(other) } } /** * This will allow us to merge all filters in the existing collection * to the filters in the other collection. All merges are done as a result * of a OR operator * * @param other The other Column Filter Collection to be merged */ def mergeUnion(other:ColumnFilterCollection): Unit = { other.columnFilterMap.foreach( e => { mergeUnion(e._1, e._2) }) } /** * This will allow us to merge all filters in the existing collection * to the filters in the other collection. All merges are done as a result * of a AND operator * * @param other The column filter from the other collection */ def mergeIntersect(other:ColumnFilterCollection): Unit = { other.columnFilterMap.foreach( e => { val existingColumnFilter = columnFilterMap.get(e._1) if (existingColumnFilter.isEmpty) { columnFilterMap += e } else { existingColumnFilter.get.mergeIntersect(e._2) } }) } /** * This will collect all the filter information in a way that is optimized * for the HBase filter commend. Allowing the filter to be accessed * with columnFamily and qualifier information * * @param schemaDefinitionMap Schema Map that will help us map the right filters * to the correct columns * @return HashMap oc column filters */ def generateFamilyQualifiterFilterMap(schemaDefinitionMap: java.util.HashMap[String, SchemaQualifierDefinition]): util.HashMap[ColumnFamilyQualifierMapKeyWrapper, ColumnFilter] = { val familyQualifierFilterMap = new util.HashMap[ColumnFamilyQualifierMapKeyWrapper, ColumnFilter]() columnFilterMap.foreach( e => { val definition = schemaDefinitionMap.get(e._1) //Don't add rowKeyFilter if (definition.columnFamilyBytes.size > 0) { familyQualifierFilterMap.put( new ColumnFamilyQualifierMapKeyWrapper( definition.columnFamilyBytes, 0, definition.columnFamilyBytes.length, definition.qualifierBytes, 0, definition.qualifierBytes.length), e._2) } }) familyQualifierFilterMap } override def toString:String = { val strBuilder = new StringBuilder columnFilterMap.foreach( e => strBuilder.append(e)) strBuilder.toString() } } /** * Status object to store static functions but also to hold last executed * information that can be used for unit testing. 
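*
* A minimal sketch of the conversion helpers; the mapping below is hypothetical and only
* meant to illustrate the expected inputs:
*
* {{{
*   val mapping = new java.util.HashMap[String, SchemaQualifierDefinition]()
*   mapping.put("id",   SchemaQualifierDefinition("id",   "INT",    "",   ""))   // row key: empty column family
*   mapping.put("name", SchemaQualifierDefinition("name", "STRING", "cf", "n"))
*
*   // encode a SparkSQL literal into the byte form used for that column in HBase
*   val idBytes = DefaultSourceStaticUtils.getByteValue("id", mapping, "42")
*
*   // getValue(columnName, mapping, result) performs the reverse conversion on a fetched Result
* }}}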
*/ object DefaultSourceStaticUtils { val rawInteger = new RawInteger val rawLong = new RawLong val rawFloat = new RawFloat val rawDouble = new RawDouble val rawString = RawString.ASCENDING val byteRange = new ThreadLocal[PositionedByteRange]{ override def initialValue(): PositionedByteRange = { val range = new SimplePositionedMutableByteRange() range.setOffset(0) range.setPosition(0) } } def getFreshByteRange(bytes:Array[Byte]): PositionedByteRange = { getFreshByteRange(bytes, 0, bytes.length) } def getFreshByteRange(bytes:Array[Byte], offset:Int = 0, length:Int): PositionedByteRange = { byteRange.get().set(bytes).setLength(length).setOffset(offset) } //This will contain the last 5 filters and required fields used in buildScan // These values can be used in unit testing to make sure we are converting // The Spark SQL input correctly val lastFiveExecutionRules = new ConcurrentLinkedQueue[ExecutionRuleForUnitTesting]() /** * This method is to populate the lastFiveExecutionRules for unit test perposes * This method is not thread safe. * * @param rowKeyFilter The rowKey Filter logic used in the last query * @param dynamicLogicExpression The dynamicLogicExpression used in the last query */ def populateLatestExecutionRules(rowKeyFilter: RowKeyFilter, dynamicLogicExpression: DynamicLogicExpression):Unit = { lastFiveExecutionRules.add(new ExecutionRuleForUnitTesting( rowKeyFilter, dynamicLogicExpression)) while (lastFiveExecutionRules.size() > 5) { lastFiveExecutionRules.poll() } } /** * This method will convert the result content from HBase into the * SQL value type that is requested by the Spark SQL schema definition * * @param columnName The name of the SparkSQL Column * @param schemaMappingDefinition The schema definition map * @param r The result object from HBase * @return The converted object type */ def getValue(columnName: String, schemaMappingDefinition: java.util.HashMap[String, SchemaQualifierDefinition], r: Result): Any = { val columnDef = schemaMappingDefinition.get(columnName) if (columnDef == null) throw new IllegalArgumentException("Unknown column:" + columnName) if (columnDef.columnFamilyBytes.isEmpty) { val row = r.getRow columnDef.columnSparkSqlType match { case IntegerType => rawInteger.decode(getFreshByteRange(row)) case LongType => rawLong.decode(getFreshByteRange(row)) case FloatType => rawFloat.decode(getFreshByteRange(row)) case DoubleType => rawDouble.decode(getFreshByteRange(row)) case StringType => rawString.decode(getFreshByteRange(row)) case TimestampType => rawLong.decode(getFreshByteRange(row)) case _ => Bytes.toString(row) } } else { val cellByteValue = r.getColumnLatestCell(columnDef.columnFamilyBytes, columnDef.qualifierBytes) if (cellByteValue == null) null else columnDef.columnSparkSqlType match { case IntegerType => rawInteger.decode(getFreshByteRange(cellByteValue.getValueArray, cellByteValue.getValueOffset, cellByteValue.getValueLength)) case LongType => rawLong.decode(getFreshByteRange(cellByteValue.getValueArray, cellByteValue.getValueOffset, cellByteValue.getValueLength)) case FloatType => rawFloat.decode(getFreshByteRange(cellByteValue.getValueArray, cellByteValue.getValueOffset, cellByteValue.getValueLength)) case DoubleType => rawDouble.decode(getFreshByteRange(cellByteValue.getValueArray, cellByteValue.getValueOffset, cellByteValue.getValueLength)) case StringType => Bytes.toString(cellByteValue.getValueArray, cellByteValue.getValueOffset, cellByteValue.getValueLength) case TimestampType => 
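// TIMESTAMP cells are stored as raw 8-byte longs (see getByteValue below), so they are read back with rawLong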
rawLong.decode(getFreshByteRange(cellByteValue.getValueArray, cellByteValue.getValueOffset, cellByteValue.getValueLength)) case _ => Bytes.toString(cellByteValue.getValueArray, cellByteValue.getValueOffset, cellByteValue.getValueLength) } } } /** * This will convert the value from SparkSQL to be stored into HBase using the * right byte Type * * @param columnName SparkSQL column name * @param schemaMappingDefinition Schema definition map * @param value String value from SparkSQL * @return Returns the byte array to go into HBase */ def getByteValue(columnName: String, schemaMappingDefinition: java.util.HashMap[String, SchemaQualifierDefinition], value: String): Array[Byte] = { val columnDef = schemaMappingDefinition.get(columnName) if (columnDef == null) { throw new IllegalArgumentException("Unknown column:" + columnName) } else { columnDef.columnSparkSqlType match { case IntegerType => val result = new Array[Byte](Bytes.SIZEOF_INT) val localDataRange = getFreshByteRange(result) rawInteger.encode(localDataRange, value.toInt) localDataRange.getBytes case LongType => val result = new Array[Byte](Bytes.SIZEOF_LONG) val localDataRange = getFreshByteRange(result) rawLong.encode(localDataRange, value.toLong) localDataRange.getBytes case FloatType => val result = new Array[Byte](Bytes.SIZEOF_FLOAT) val localDataRange = getFreshByteRange(result) rawFloat.encode(localDataRange, value.toFloat) localDataRange.getBytes case DoubleType => val result = new Array[Byte](Bytes.SIZEOF_DOUBLE) val localDataRange = getFreshByteRange(result) rawDouble.encode(localDataRange, value.toDouble) localDataRange.getBytes case StringType => Bytes.toBytes(value) case TimestampType => val result = new Array[Byte](Bytes.SIZEOF_LONG) val localDataRange = getFreshByteRange(result) rawLong.encode(localDataRange, value.toLong) localDataRange.getBytes case _ => Bytes.toBytes(value) } } } } /** * Contains information related to a filters for a given column. * This can contain many ranges or points. 
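*
* Rough sketch of how buildScan combines row key predicates into this filter (values are
* illustrative only):
*
* {{{
*   // rowKey == "a"  becomes a point
*   val eq = new RowKeyFilter(Bytes.toBytes("a"), null)
*
*   // rowKey >= "a"  becomes the range ["a", unbounded)
*   val ge = new RowKeyFilter(null, new ScanRange(null, true, Bytes.toBytes("a"), true))
*
*   // ANDing the predicates keeps only the points and ranges that survive the intersection
*   ge.mergeIntersect(eq)
* }}}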
* * @param currentPoint the initial point when the filter is created * @param currentRange the initial scanRange when the filter is created */ class RowKeyFilter (currentPoint:Array[Byte] = null, currentRange:ScanRange = new ScanRange(null, true, new Array[Byte](0), true), var points:mutable.MutableList[Array[Byte]] = new mutable.MutableList[Array[Byte]](), var ranges:mutable.MutableList[ScanRange] = new mutable.MutableList[ScanRange]() ) extends Serializable { //Collection of ranges if (currentRange != null ) ranges.+=(currentRange) //Collection of points if (currentPoint != null) points.+=(currentPoint) /** * This will validate a give value through the filter's points and/or ranges * the result will be if the value passed the filter * * @param value Value to be validated * @param valueOffSet The offset of the value * @param valueLength The length of the value * @return True is the value passes the filter false if not */ def validate(value:Array[Byte], valueOffSet:Int, valueLength:Int):Boolean = { var result = false points.foreach( p => { if (Bytes.equals(p, 0, p.length, value, valueOffSet, valueLength)) { result = true } }) ranges.foreach( r => { val upperBoundPass = r.upperBound == null || (r.isUpperBoundEqualTo && Bytes.compareTo(r.upperBound, 0, r.upperBound.length, value, valueOffSet, valueLength) >= 0) || (!r.isUpperBoundEqualTo && Bytes.compareTo(r.upperBound, 0, r.upperBound.length, value, valueOffSet, valueLength) > 0) val lowerBoundPass = r.lowerBound == null || r.lowerBound.length == 0 (r.isLowerBoundEqualTo && Bytes.compareTo(r.lowerBound, 0, r.lowerBound.length, value, valueOffSet, valueLength) <= 0) || (!r.isLowerBoundEqualTo && Bytes.compareTo(r.lowerBound, 0, r.lowerBound.length, value, valueOffSet, valueLength) < 0) result = result || (upperBoundPass && lowerBoundPass) }) result } /** * This will allow us to merge filter logic that is joined to the existing filter * through a OR operator * * @param other Filter to merge */ def mergeUnion(other:RowKeyFilter): Unit = { other.points.foreach( p => points += p) other.ranges.foreach( otherR => { var doesOverLap = false ranges.foreach{ r => if (r.getOverLapScanRange(otherR) != null) { r.mergeUnion(otherR) doesOverLap = true }} if (!doesOverLap) ranges.+=(otherR) }) } /** * This will allow us to merge filter logic that is joined to the existing filter * through a AND operator * * @param other Filter to merge */ def mergeIntersect(other:RowKeyFilter): Unit = { val survivingPoints = new mutable.MutableList[Array[Byte]]() val didntSurviveFirstPassPoints = new mutable.MutableList[Array[Byte]]() if (points == null || points.length == 0) { other.points.foreach( otherP => { didntSurviveFirstPassPoints += otherP }) } else { points.foreach(p => { if (other.points.length == 0) { didntSurviveFirstPassPoints += p } else { other.points.foreach(otherP => { if (Bytes.equals(p, otherP)) { survivingPoints += p } else { didntSurviveFirstPassPoints += p } }) } }) } val survivingRanges = new mutable.MutableList[ScanRange]() if (ranges.length == 0) { didntSurviveFirstPassPoints.foreach(p => { survivingPoints += p }) } else { ranges.foreach(r => { other.ranges.foreach(otherR => { val overLapScanRange = r.getOverLapScanRange(otherR) if (overLapScanRange != null) { survivingRanges += overLapScanRange } }) didntSurviveFirstPassPoints.foreach(p => { if (r.containsPoint(p)) { survivingPoints += p } }) }) } points = survivingPoints ranges = survivingRanges } override def toString:String = { val strBuilder = new StringBuilder strBuilder.append("(points:(") 
var isFirst = true points.foreach( p => { if (isFirst) isFirst = false else strBuilder.append(",") strBuilder.append(Bytes.toString(p)) }) strBuilder.append("),ranges:") isFirst = true ranges.foreach( r => { if (isFirst) isFirst = false else strBuilder.append(",") strBuilder.append(r) }) strBuilder.append("))") strBuilder.toString() } } class ExecutionRuleForUnitTesting(val rowKeyFilter: RowKeyFilter, val dynamicLogicExpression: DynamicLogicExpression) ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/DynamicLogicExpression.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark import java.util import org.apache.hadoop.hbase.util.Bytes /** * Dynamic logic for SQL push down logic there is an instance for most * common operations and a pass through for other operations not covered here * * Logic can be nested with And or Or operators. * * A logic tree can be written out as a string and reconstructed from that string * */ trait DynamicLogicExpression { def execute(columnToCurrentRowValueMap: util.HashMap[String, ByteArrayComparable], valueFromQueryValueArray:Array[Array[Byte]]): Boolean def toExpressionString: String = { val strBuilder = new StringBuilder appendToExpression(strBuilder) strBuilder.toString() } def appendToExpression(strBuilder:StringBuilder) } class AndLogicExpression (val leftExpression:DynamicLogicExpression, val rightExpression:DynamicLogicExpression) extends DynamicLogicExpression{ override def execute(columnToCurrentRowValueMap: util.HashMap[String, ByteArrayComparable], valueFromQueryValueArray:Array[Array[Byte]]): Boolean = { leftExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray) && rightExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray) } override def appendToExpression(strBuilder: StringBuilder): Unit = { strBuilder.append("( ") strBuilder.append(leftExpression.toExpressionString) strBuilder.append(" AND ") strBuilder.append(rightExpression.toExpressionString) strBuilder.append(" )") } } class OrLogicExpression (val leftExpression:DynamicLogicExpression, val rightExpression:DynamicLogicExpression) extends DynamicLogicExpression{ override def execute(columnToCurrentRowValueMap: util.HashMap[String, ByteArrayComparable], valueFromQueryValueArray:Array[Array[Byte]]): Boolean = { leftExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray) || rightExpression.execute(columnToCurrentRowValueMap, valueFromQueryValueArray) } override def appendToExpression(strBuilder: StringBuilder): Unit = { strBuilder.append("( ") strBuilder.append(leftExpression.toExpressionString) strBuilder.append(" OR ") 
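// the rendered form is "( <left> OR <right> )" with single-space separators, which is the
// format DynamicLogicExpressionBuilder.build below can parse back into an expression tree,
// e.g. DynamicLogicExpressionBuilder.build("( Col0 == 0 OR Col1 < 1 )")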
strBuilder.append(rightExpression.toExpressionString) strBuilder.append(" )") } } class EqualLogicExpression (val columnName:String, val valueFromQueryIndex:Int, val isNot:Boolean) extends DynamicLogicExpression{ override def execute(columnToCurrentRowValueMap: util.HashMap[String, ByteArrayComparable], valueFromQueryValueArray:Array[Array[Byte]]): Boolean = { val currentRowValue = columnToCurrentRowValueMap.get(columnName) val valueFromQuery = valueFromQueryValueArray(valueFromQueryIndex) currentRowValue != null && Bytes.equals(valueFromQuery, 0, valueFromQuery.length, currentRowValue.bytes, currentRowValue.offset, currentRowValue.length) != isNot } override def appendToExpression(strBuilder: StringBuilder): Unit = { val command = if (isNot) "!=" else "==" strBuilder.append(columnName + " " + command + " " + valueFromQueryIndex) } } class IsNullLogicExpression (val columnName:String, val isNot:Boolean) extends DynamicLogicExpression{ override def execute(columnToCurrentRowValueMap: util.HashMap[String, ByteArrayComparable], valueFromQueryValueArray:Array[Array[Byte]]): Boolean = { val currentRowValue = columnToCurrentRowValueMap.get(columnName) (currentRowValue == null) != isNot } override def appendToExpression(strBuilder: StringBuilder): Unit = { val command = if (isNot) "isNotNull" else "isNull" strBuilder.append(columnName + " " + command) } } class GreaterThanLogicExpression (val columnName:String, val valueFromQueryIndex:Int) extends DynamicLogicExpression{ override def execute(columnToCurrentRowValueMap: util.HashMap[String, ByteArrayComparable], valueFromQueryValueArray:Array[Array[Byte]]): Boolean = { val currentRowValue = columnToCurrentRowValueMap.get(columnName) val valueFromQuery = valueFromQueryValueArray(valueFromQueryIndex) currentRowValue != null && Bytes.compareTo(currentRowValue.bytes, currentRowValue.offset, currentRowValue.length, valueFromQuery, 0, valueFromQuery.length) > 0 } override def appendToExpression(strBuilder: StringBuilder): Unit = { strBuilder.append(columnName + " > " + valueFromQueryIndex) } } class GreaterThanOrEqualLogicExpression (val columnName:String, val valueFromQueryIndex:Int) extends DynamicLogicExpression{ override def execute(columnToCurrentRowValueMap: util.HashMap[String, ByteArrayComparable], valueFromQueryValueArray:Array[Array[Byte]]): Boolean = { val currentRowValue = columnToCurrentRowValueMap.get(columnName) val valueFromQuery = valueFromQueryValueArray(valueFromQueryIndex) currentRowValue != null && Bytes.compareTo(currentRowValue.bytes, currentRowValue.offset, currentRowValue.length, valueFromQuery, 0, valueFromQuery.length) >= 0 } override def appendToExpression(strBuilder: StringBuilder): Unit = { strBuilder.append(columnName + " >= " + valueFromQueryIndex) } } class LessThanLogicExpression (val columnName:String, val valueFromQueryIndex:Int) extends DynamicLogicExpression{ override def execute(columnToCurrentRowValueMap: util.HashMap[String, ByteArrayComparable], valueFromQueryValueArray:Array[Array[Byte]]): Boolean = { val currentRowValue = columnToCurrentRowValueMap.get(columnName) val valueFromQuery = valueFromQueryValueArray(valueFromQueryIndex) currentRowValue != null && Bytes.compareTo(currentRowValue.bytes, currentRowValue.offset, currentRowValue.length, valueFromQuery, 0, valueFromQuery.length) < 0 } override def appendToExpression(strBuilder: StringBuilder): Unit = { strBuilder.append(columnName + " < " + valueFromQueryIndex) } } class LessThanOrEqualLogicExpression (val columnName:String, val valueFromQueryIndex:Int) 
extends DynamicLogicExpression{ override def execute(columnToCurrentRowValueMap: util.HashMap[String, ByteArrayComparable], valueFromQueryValueArray:Array[Array[Byte]]): Boolean = { val currentRowValue = columnToCurrentRowValueMap.get(columnName) val valueFromQuery = valueFromQueryValueArray(valueFromQueryIndex) currentRowValue != null && Bytes.compareTo(currentRowValue.bytes, currentRowValue.offset, currentRowValue.length, valueFromQuery, 0, valueFromQuery.length) <= 0 } override def appendToExpression(strBuilder: StringBuilder): Unit = { strBuilder.append(columnName + " <= " + valueFromQueryIndex) } } class PassThroughLogicExpression() extends DynamicLogicExpression { override def execute(columnToCurrentRowValueMap: util.HashMap[String, ByteArrayComparable], valueFromQueryValueArray: Array[Array[Byte]]): Boolean = true override def appendToExpression(strBuilder: StringBuilder): Unit = { strBuilder.append("Pass") } } object DynamicLogicExpressionBuilder { def build(expressionString:String): DynamicLogicExpression = { val expressionAndOffset = build(expressionString.split(' '), 0) expressionAndOffset._1 } private def build(expressionArray:Array[String], offSet:Int): (DynamicLogicExpression, Int) = { if (expressionArray(offSet).equals("(")) { val left = build(expressionArray, offSet + 1) val right = build(expressionArray, left._2 + 1) if (expressionArray(left._2).equals("AND")) { (new AndLogicExpression(left._1, right._1), right._2 + 1) } else if (expressionArray(left._2).equals("OR")) { (new OrLogicExpression(left._1, right._1), right._2 + 1) } else { throw new Throwable("Unknown gate:" + expressionArray(left._2)) } } else { val command = expressionArray(offSet + 1) if (command.equals("<")) { (new LessThanLogicExpression(expressionArray(offSet), expressionArray(offSet + 2).toInt), offSet + 3) } else if (command.equals("<=")) { (new LessThanOrEqualLogicExpression(expressionArray(offSet), expressionArray(offSet + 2).toInt), offSet + 3) } else if (command.equals(">")) { (new GreaterThanLogicExpression(expressionArray(offSet), expressionArray(offSet + 2).toInt), offSet + 3) } else if (command.equals(">=")) { (new GreaterThanOrEqualLogicExpression(expressionArray(offSet), expressionArray(offSet + 2).toInt), offSet + 3) } else if (command.equals("==")) { (new EqualLogicExpression(expressionArray(offSet), expressionArray(offSet + 2).toInt, false), offSet + 3) } else if (command.equals("!=")) { (new EqualLogicExpression(expressionArray(offSet), expressionArray(offSet + 2).toInt, true), offSet + 3) } else if (command.equals("isNull")) { (new IsNullLogicExpression(expressionArray(offSet), false), offSet + 2) } else if (command.equals("isNotNull")) { (new IsNullLogicExpression(expressionArray(offSet), true), offSet + 2) } else if (command.equals("Pass")) { (new PassThroughLogicExpression, offSet + 2) } else { throw new Throwable("Unknown logic command:" + command) } } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/FamiliesQualifiersValues.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark import java.util /** * This object is a clean way to store and sort all cells that will be bulk * loaded into a single row */ class FamiliesQualifiersValues extends Serializable { //Tree maps are used because we need the results to // be sorted when we read them val familyMap = new util.TreeMap[ByteArrayWrapper, util.TreeMap[ByteArrayWrapper, Array[Byte]]]() //normally in a row there are more columns then //column families this wrapper is reused for column //family look ups val reusableWrapper = new ByteArrayWrapper(null) /** * Adds a new cell to an existing row * @param family HBase column family * @param qualifier HBase column qualifier * @param value HBase cell value */ def += (family: Array[Byte], qualifier: Array[Byte], value: Array[Byte]): Unit = { reusableWrapper.value = family var qualifierValues = familyMap.get(reusableWrapper) if (qualifierValues == null) { qualifierValues = new util.TreeMap[ByteArrayWrapper, Array[Byte]]() familyMap.put(new ByteArrayWrapper(family), qualifierValues) } qualifierValues.put(new ByteArrayWrapper(qualifier), value) } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/FamilyHFileWriteOptions.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark import java.io.Serializable /** * This object will hold optional data for how a given column family's * writer will work * * @param compression String to define the Compression to be used in the HFile * @param bloomType String to define the bloom type to be used in the HFile * @param blockSize The block size to be used in the HFile * @param dataBlockEncoding String to define the data block encoding to be used * in the HFile */ class FamilyHFileWriteOptions( val compression:String, val bloomType: String, val blockSize: Int, val dataBlockEncoding: String) extends Serializable ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/HBaseContext.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark import java.io._ import java.net.InetSocketAddress import java.util import java.util.UUID import javax.management.openmbean.KeyAlreadyExistsException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hbase._ import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.fs.HFileSystem import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.io.compress.Compression import org.apache.hadoop.hbase.io.compress.Compression.Algorithm import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding import org.apache.hadoop.hbase.io.hfile.{AbstractHFileWriter, CacheConfig, HFile, HFileContextBuilder} import org.apache.hadoop.hbase.mapreduce.{IdentityTableMapper, TableInputFormat, TableMapReduceUtil} import org.apache.hadoop.hbase.regionserver.{BloomType, HStore, StoreFile} import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.mapreduce.Job import org.apache.spark.broadcast.Broadcast import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.{SerializableWritable, SparkContext} import org.apache.spark.internal.Logging import scala.collection.mutable import scala.reflect.ClassTag /** * HBaseContext is a façade for HBase operations * like bulk put, get, increment, delete, and scan * * HBaseContext will take the responsibilities * of disseminating the configuration information * to the working and managing the life cycle of HConnections. */ class HBaseContext(@transient sc: SparkContext, @transient val config: Configuration, val tmpHdfsConfgFile: String = null) extends Serializable with Logging { @transient var tmpHdfsConfiguration:Configuration = config val broadcastedConf = sc.broadcast(new SerializableWritable(config)) val connFactory = new ConnFactoryExtend LatestHBaseContextCache.latest = this if (tmpHdfsConfgFile != null && config != null) { val fs = FileSystem.newInstance(config) val tmpPath = new Path(tmpHdfsConfgFile) if (!fs.exists(tmpPath)) { val outputStream = fs.create(tmpPath) config.write(outputStream) outputStream.close() } else { logWarning("tmpHdfsConfigDir " + tmpHdfsConfgFile + " exist!!") } } /** * A simple enrichment of the traditional Spark RDD foreachPartition. * This function differs from the original in that it offers the * developer access to a already connected HConnection object * * Note: Do not close the HConnection object. 
All HConnection * management is handled outside this method * * @param rdd Original RDD with data to iterate over * @param f Function to be given a iterator to iterate through * the RDD values and a HConnection object to interact * with HBase */ def foreachPartition[T](rdd: RDD[T], f: (Iterator[T], Connection) => Unit):Unit = { rdd.foreachPartition( it => hbaseForeachPartition(broadcastedConf, it, f)) } /** * A simple enrichment of the traditional Spark Streaming dStream foreach * This function differs from the original in that it offers the * developer access to a already connected HConnection object * * Note: Do not close the HConnection object. All HConnection * management is handled outside this method * * @param dstream Original DStream with data to iterate over * @param f Function to be given a iterator to iterate through * the DStream values and a HConnection object to * interact with HBase */ def foreachPartition[T](dstream: DStream[T], f: (Iterator[T], Connection) => Unit):Unit = { dstream.foreachRDD((rdd, time) => { foreachPartition(rdd, f) }) } /** * A simple enrichment of the traditional Spark RDD mapPartition. * This function differs from the original in that it offers the * developer access to a already connected HConnection object * * Note: Do not close the HConnection object. All HConnection * management is handled outside this method * * @param rdd Original RDD with data to iterate over * @param mp Function to be given a iterator to iterate through * the RDD values and a HConnection object to interact * with HBase * @return Returns a new RDD generated by the user definition * function just like normal mapPartition */ def mapPartitions[T, R: ClassTag](rdd: RDD[T], mp: (Iterator[T], Connection) => Iterator[R]): RDD[R] = { rdd.mapPartitions[R](it => hbaseMapPartition[T, R](broadcastedConf, it, mp)) } /** * A simple enrichment of the traditional Spark Streaming DStream * foreachPartition. * * This function differs from the original in that it offers the * developer access to a already connected HConnection object * * Note: Do not close the HConnection object. All HConnection * management is handled outside this method * * Note: Make sure to partition correctly to avoid memory issue when * getting data from HBase * * @param dstream Original DStream with data to iterate over * @param f Function to be given a iterator to iterate through * the DStream values and a HConnection object to * interact with HBase * @return Returns a new DStream generated by the user * definition function just like normal mapPartition */ def streamForeachPartition[T](dstream: DStream[T], f: (Iterator[T], Connection) => Unit): Unit = { dstream.foreachRDD(rdd => this.foreachPartition(rdd, f)) } /** * A simple enrichment of the traditional Spark Streaming DStream * mapPartition. * * This function differs from the original in that it offers the * developer access to a already connected HConnection object * * Note: Do not close the HConnection object. 
All HConnection * management is handled outside this method * * Note: Make sure to partition correctly to avoid memory issue when * getting data from HBase * * @param dstream Original DStream with data to iterate over * @param f Function to be given a iterator to iterate through * the DStream values and a HConnection object to * interact with HBase * @return Returns a new DStream generated by the user * definition function just like normal mapPartition */ def streamMapPartitions[T, U: ClassTag](dstream: DStream[T], f: (Iterator[T], Connection) => Iterator[U]): DStream[U] = { dstream.mapPartitions(it => hbaseMapPartition[T, U]( broadcastedConf, it, f)) } /** * A simple abstraction over the HBaseContext.foreachPartition method. * * It allow addition support for a user to take RDD * and generate puts and send them to HBase. * The complexity of managing the HConnection is * removed from the developer * * @param rdd Original RDD with data to iterate over * @param tableName The name of the table to put into * @param f Function to convert a value in the RDD to a HBase Put */ def bulkPut[T](rdd: RDD[T], tableName: TableName, f: (T) => Put) { val tName = tableName.getName rdd.foreachPartition( it => hbaseForeachPartition[T]( broadcastedConf, it, (iterator, connection) => { val m = connection.getBufferedMutator(TableName.valueOf(tName)) iterator.foreach(T => m.mutate(f(T))) m.flush() m.close() })) } /** * A simple abstraction over the HBaseContext.streamMapPartition method. * * It allow addition support for a user to take a DStream and * generate puts and send them to HBase. * * The complexity of managing the HConnection is * removed from the developer * * @param dstream Original DStream with data to iterate over * @param tableName The name of the table to put into * @param f Function to convert a value in * the DStream to a HBase Put */ def streamBulkPut[T](dstream: DStream[T], tableName: TableName, f: (T) => Put) = { val tName = tableName.getName dstream.foreachRDD((rdd, time) => { bulkPut(rdd, TableName.valueOf(tName), f) }) } /** * A simple abstraction over the HBaseContext.foreachPartition method. * * It allow addition support for a user to take a RDD and generate delete * and send them to HBase. The complexity of managing the HConnection is * removed from the developer * * @param rdd Original RDD with data to iterate over * @param tableName The name of the table to delete from * @param f Function to convert a value in the RDD to a * HBase Deletes * @param batchSize The number of delete to batch before sending to HBase */ def bulkDelete[T](rdd: RDD[T], tableName: TableName, f: (T) => Delete, batchSize: Integer) { bulkMutation(rdd, tableName, f, batchSize) } /** * A simple abstraction over the HBaseContext.streamBulkMutation method. * * It allow addition support for a user to take a DStream and * generate Delete and send them to HBase. 
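*
* A minimal sketch (hypothetical table, row key stream, and HBaseContext instance named
* hbaseContext), mirroring the RDD-based bulkDelete above:
*
* {{{
*   // assumes a DStream[String] of row keys and a table named "t1"
*   hbaseContext.streamBulkDelete[String](rowKeyStream,
*     TableName.valueOf("t1"),
*     rowKey => new Delete(Bytes.toBytes(rowKey)),
*     100)   // batchSize
* }}}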
* * The complexity of managing the HConnection is * removed from the developer * * @param dstream Original DStream with data to iterate over * @param tableName The name of the table to delete from * @param f function to convert a value in the DStream to a * HBase Delete * @param batchSize The number of deletes to batch before sending to HBase */ def streamBulkDelete[T](dstream: DStream[T], tableName: TableName, f: (T) => Delete, batchSize: Integer) = { streamBulkMutation(dstream, tableName, f, batchSize) } /** * Under lining function to support all bulk mutations * * May be opened up if requested */ private def bulkMutation[T](rdd: RDD[T], tableName: TableName, f: (T) => Mutation, batchSize: Integer) { val tName = tableName.getName rdd.foreachPartition( it => hbaseForeachPartition[T]( broadcastedConf, it, (iterator, connection) => { val table = connection.getTable(TableName.valueOf(tName)) val mutationList = new java.util.ArrayList[Mutation] iterator.foreach(T => { mutationList.add(f(T)) if (mutationList.size >= batchSize) { table.batch(mutationList, null) mutationList.clear() } }) if (mutationList.size() > 0) { table.batch(mutationList, null) mutationList.clear() } table.close() })) } /** * Under lining function to support all bulk streaming mutations * * May be opened up if requested */ private def streamBulkMutation[T](dstream: DStream[T], tableName: TableName, f: (T) => Mutation, batchSize: Integer) = { val tName = tableName.getName dstream.foreachRDD((rdd, time) => { bulkMutation(rdd, TableName.valueOf(tName), f, batchSize) }) } /** * A simple abstraction over the HBaseContext.mapPartition method. * * It allow addition support for a user to take a RDD and generates a * new RDD based on Gets and the results they bring back from HBase * * @param rdd Original RDD with data to iterate over * @param tableName The name of the table to get from * @param makeGet function to convert a value in the RDD to a * HBase Get * @param convertResult This will convert the HBase Result object to * what ever the user wants to put in the resulting * RDD * return new RDD that is created by the Get to HBase */ def bulkGet[T, U: ClassTag](tableName: TableName, batchSize: Integer, rdd: RDD[T], makeGet: (T) => Get, convertResult: (Result) => U): RDD[U] = { val getMapPartition = new GetMapPartition(tableName, batchSize, makeGet, convertResult) rdd.mapPartitions[U](it => hbaseMapPartition[T, U]( broadcastedConf, it, getMapPartition.run)) } /** * A simple abstraction over the HBaseContext.streamMap method. 
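*
* A rough usage sketch of the batch-get pattern, shown with the RDD-based bulkGet defined
* above; the streaming variant below mirrors it. The table name, row key RDD, and
* HBaseContext instance are illustrative assumptions:
*
* {{{
*   // assumes an RDD[Array[Byte]] of row keys and a table named "t1"
*   val values: RDD[String] = hbaseContext.bulkGet[Array[Byte], String](
*     TableName.valueOf("t1"),
*     2,   // batchSize
*     rowKeyRdd,
*     rowKey => new Get(rowKey),
*     (result: Result) => Bytes.toString(result.getRow))
* }}}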
* * It allow addition support for a user to take a DStream and * generates a new DStream based on Gets and the results * they bring back from HBase * * @param tableName The name of the table to get from * @param batchSize The number of Gets to be sent in a single batch * @param dStream Original DStream with data to iterate over * @param makeGet Function to convert a value in the DStream to a * HBase Get * @param convertResult This will convert the HBase Result object to * what ever the user wants to put in the resulting * DStream * @return A new DStream that is created by the Get to HBase */ def streamBulkGet[T, U: ClassTag](tableName: TableName, batchSize: Integer, dStream: DStream[T], makeGet: (T) => Get, convertResult: (Result) => U): DStream[U] = { val getMapPartition = new GetMapPartition(tableName, batchSize, makeGet, convertResult) dStream.mapPartitions[U](it => hbaseMapPartition[T, U]( broadcastedConf, it, getMapPartition.run)) } /** * This function will use the native HBase TableInputFormat with the * given scan object to generate a new RDD * * @param tableName the name of the table to scan * @param scan the HBase scan object to use to read data from HBase * @param f function to convert a Result object from HBase into * what the user wants in the final generated RDD * @return new RDD with results from scan */ def hbaseRDD[U: ClassTag](tableName: TableName, scan: Scan, f: ((ImmutableBytesWritable, Result)) => U): RDD[U] = { val job: Job = Job.getInstance(getConf(broadcastedConf)) TableMapReduceUtil.initTableMapperJob(tableName, scan, classOf[IdentityTableMapper], null, null, job) new NewHBaseRDD(sc, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result], job.getConfiguration, this).map(f) } /** * A overloaded version of HBaseContext hbaseRDD that defines the * type of the resulting RDD * * @param tableName the name of the table to scan * @param scans the HBase scan object to use to read data from HBase * @return New RDD with results from scan * */ def hbaseRDD(tableName: TableName, scans: Scan): RDD[(ImmutableBytesWritable, Result)] = { hbaseRDD[(ImmutableBytesWritable, Result)]( tableName, scans, (r: (ImmutableBytesWritable, Result)) => r) } /** * underlining wrapper all foreach functions in HBaseContext */ private def hbaseForeachPartition[T](configBroadcast: Broadcast[SerializableWritable[Configuration]], it: Iterator[T], f: (Iterator[T], Connection) => Unit) = { val config = getConf(configBroadcast) // specify that this is a proxy user val connection = connFactory.getConnectionInstance(config) f(it, connection) } private def getConf(configBroadcast: Broadcast[SerializableWritable[Configuration]]): Configuration = { if (tmpHdfsConfiguration == null && tmpHdfsConfgFile != null) { val fs = FileSystem.newInstance(SparkHadoopUtil.get.conf) val inputStream = fs.open(new Path(tmpHdfsConfgFile)) tmpHdfsConfiguration = new Configuration(false) tmpHdfsConfiguration.readFields(inputStream) inputStream.close() } if (tmpHdfsConfiguration == null) { try { tmpHdfsConfiguration = configBroadcast.value.value } catch { case ex: Exception => logError("Unable to getConfig from broadcast", ex) } } tmpHdfsConfiguration } /** * underlining wrapper all mapPartition functions in HBaseContext * */ private def hbaseMapPartition[K, U]( configBroadcast: Broadcast[SerializableWritable[Configuration]], it: Iterator[K], mp: (Iterator[K], Connection) => Iterator[U]): Iterator[U] = { val config = getConf(configBroadcast) val connection = connFactory.getConnectionInstance(config) val res = 
mp(it, connection) res } /** * underlining wrapper all get mapPartition functions in HBaseContext */ private class GetMapPartition[T, U](tableName: TableName, batchSize: Integer, makeGet: (T) => Get, convertResult: (Result) => U) extends Serializable { val tName = tableName.getName def run(iterator: Iterator[T], connection: Connection): Iterator[U] = { val table = connection.getTable(TableName.valueOf(tName)) val gets = new java.util.ArrayList[Get]() var res = List[U]() while (iterator.hasNext) { gets.add(makeGet(iterator.next())) if (gets.size() == batchSize) { val results = table.get(gets) res = res ++ results.map(convertResult) gets.clear() } } if (gets.size() > 0) { val results = table.get(gets) res = res ++ results.map(convertResult) gets.clear() } table.close() res.iterator } } /** * Produces a ClassTag[T], which is actually just a casted ClassTag[AnyRef]. * * This method is used to keep ClassTags out of the external Java API, as * the Java compiler cannot produce them automatically. While this * ClassTag-faking does please the compiler, it can cause problems at runtime * if the Scala API relies on ClassTags for correctness. * * Often, though, a ClassTag[AnyRef] will not lead to incorrect behavior, * just worse performance or security issues. * For instance, an Array of AnyRef can hold any type T, but may lose primitive * specialization. */ private[spark] def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] /** * Spark Implementation of HBase Bulk load for wide rows or when * values are not already combined at the time of the map process * * This will take the content from an existing RDD then sort and shuffle * it with respect to region splits. The result of that sort and shuffle * will be written to HFiles. * * After this function is executed the user will have to call * LoadIncrementalHFiles.doBulkLoad(...) to move the files into HBase * * Also note this version of bulk load is different from past versions in * that it includes the qualifier as part of the sort process. The * reason for this is to be able to support rows will very large number * of columns. 
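 *
 * A hedged end-to-end sketch (the input RDD layout, table name and staging
 * path are assumptions for illustration, and the exact LoadIncrementalHFiles
 * call depends on the HBase version in use):
 * {{{
 *   // rdd is an assumed RDD[(Array[Byte], Array[Byte])] of (rowKey, value) pairs
 *   val stagingDir = "/tmp/hfile-staging"
 *   hbaseContext.bulkLoad[(Array[Byte], Array[Byte])](rdd,
 *     TableName.valueOf("t1"),
 *     { case (rowKey, value) =>
 *         Iterator((new KeyFamilyQualifier(rowKey, Bytes.toBytes("f"), Bytes.toBytes("q")), value)) },
 *     stagingDir)
 *   // afterwards, move the generated HFiles into the table, for example (version dependent):
 *   //   new LoadIncrementalHFiles(config).doBulkLoad(new Path(stagingDir), new HTable(config, "t1"))
 * }}}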
* * @param rdd The RDD we are bulk loading from * @param tableName The HBase table we are loading into * @param flatMap A flapMap function that will make every * row in the RDD * into N cells for the bulk load * @param stagingDir The location on the FileSystem to bulk load into * @param familyHFileWriteOptionsMap Options that will define how the HFile for a * column family is written * @param compactionExclude Compaction excluded for the HFiles * @param maxSize Max size for the HFiles before they roll * @tparam T The Type of values in the original RDD */ def bulkLoad[T](rdd:RDD[T], tableName: TableName, flatMap: (T) => Iterator[(KeyFamilyQualifier, Array[Byte])], stagingDir:String, familyHFileWriteOptionsMap: util.Map[Array[Byte], FamilyHFileWriteOptions] = new util.HashMap[Array[Byte], FamilyHFileWriteOptions], compactionExclude: Boolean = false, maxSize:Long = HConstants.DEFAULT_MAX_FILE_SIZE): Unit = { val conn = connFactory.getConnectionInstance(config) val regionLocator = conn.getRegionLocator(tableName) val startKeys = regionLocator.getStartKeys val defaultCompressionStr = config.get("hfile.compression", Compression.Algorithm.NONE.getName) val hfileCompression = AbstractHFileWriter.compressionByName(defaultCompressionStr) val nowTimeStamp = System.currentTimeMillis() val tableRawName = tableName.getName val familyHFileWriteOptionsMapInternal = new util.HashMap[ByteArrayWrapper, FamilyHFileWriteOptions] val entrySetIt = familyHFileWriteOptionsMap.entrySet().iterator() while (entrySetIt.hasNext) { val entry = entrySetIt.next() familyHFileWriteOptionsMapInternal.put(new ByteArrayWrapper(entry.getKey), entry.getValue) } val regionSplitPartitioner = new BulkLoadPartitioner(startKeys) //This is where all the magic happens //Here we are going to do the following things // 1. FlapMap every row in the RDD into key column value tuples // 2. Then we are going to repartition sort and shuffle // 3. Finally we are going to write out our HFiles rdd.flatMap( r => flatMap(r)). repartitionAndSortWithinPartitions(regionSplitPartitioner). hbaseForeachPartition(this, (it, conn) => { val conf = broadcastedConf.value.value val fs = FileSystem.get(conf) val writerMap = new mutable.HashMap[ByteArrayWrapper, WriterLength] var previousRow:Array[Byte] = HConstants.EMPTY_BYTE_ARRAY var rollOverRequested = false val localTableName = TableName.valueOf(tableRawName) //Here is where we finally iterate through the data in this partition of the //RDD that has been sorted and partitioned it.foreach{ case (keyFamilyQualifier, cellValue:Array[Byte]) => val wl = writeValueToHFile(keyFamilyQualifier.rowKey, keyFamilyQualifier.family, keyFamilyQualifier.qualifier, cellValue, nowTimeStamp, fs, conn, localTableName, conf, familyHFileWriteOptionsMapInternal, hfileCompression, writerMap, stagingDir) rollOverRequested = rollOverRequested || wl.written > maxSize //This will only roll if we have at least one column family file that is //bigger then maxSize and we have finished a given row key if (rollOverRequested && Bytes.compareTo(previousRow, keyFamilyQualifier.rowKey) != 0) { rollWriters(fs, writerMap, regionSplitPartitioner, previousRow, compactionExclude) rollOverRequested = false } previousRow = keyFamilyQualifier.rowKey } //We have finished all the data so lets close up the writers rollWriters(fs, writerMap, regionSplitPartitioner, previousRow, compactionExclude) rollOverRequested = false }) } /** * Spark Implementation of HBase Bulk load for short rows some where less then * a 1000 columns. 
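 *
 * As a quick illustration up front, a hedged usage sketch (the input RDD
 * layout, table name and staging path are assumptions; it also assumes the
 * += helper on FamiliesQualifiersValues for adding a family/qualifier/value
 * triple):
 * {{{
 *   // rdd is an assumed RDD[(Array[Byte], Array[Byte])] with one record per row key
 *   hbaseContext.bulkLoadThinRows[(Array[Byte], Array[Byte])](rdd,
 *     TableName.valueOf("t1"),
 *     { case (rowKey, value) =>
 *         val fqv = new FamiliesQualifiersValues
 *         fqv += (Bytes.toBytes("f"), Bytes.toBytes("q"), value)
 *         (new ByteArrayWrapper(rowKey), fqv) },
 *     "/tmp/hfile-staging")
 * }}}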
This bulk load should be faster for tables will thinner * rows then the other spark implementation of bulk load that puts only one * value into a record going into a shuffle * * This will take the content from an existing RDD then sort and shuffle * it with respect to region splits. The result of that sort and shuffle * will be written to HFiles. * * After this function is executed the user will have to call * LoadIncrementalHFiles.doBulkLoad(...) to move the files into HBase * * In this implementation, only the rowKey is given to the shuffle as the key * and all the columns are already linked to the RowKey before the shuffle * stage. The sorting of the qualifier is done in memory out side of the * shuffle stage * * Also make sure that incoming RDDs only have one record for every row key. * * @param rdd The RDD we are bulk loading from * @param tableName The HBase table we are loading into * @param mapFunction A function that will convert the RDD records to * the key value format used for the shuffle to prep * for writing to the bulk loaded HFiles * @param stagingDir The location on the FileSystem to bulk load into * @param familyHFileWriteOptionsMap Options that will define how the HFile for a * column family is written * @param compactionExclude Compaction excluded for the HFiles * @param maxSize Max size for the HFiles before they roll * @tparam T The Type of values in the original RDD */ def bulkLoadThinRows[T](rdd:RDD[T], tableName: TableName, mapFunction: (T) => (ByteArrayWrapper, FamiliesQualifiersValues), stagingDir:String, familyHFileWriteOptionsMap: util.Map[Array[Byte], FamilyHFileWriteOptions] = new util.HashMap[Array[Byte], FamilyHFileWriteOptions], compactionExclude: Boolean = false, maxSize:Long = HConstants.DEFAULT_MAX_FILE_SIZE): Unit = { val conn = connFactory.getConnectionInstance(config) val regionLocator = conn.getRegionLocator(tableName) val startKeys = regionLocator.getStartKeys val defaultCompressionStr = config.get("hfile.compression", Compression.Algorithm.NONE.getName) val defaultCompression = AbstractHFileWriter.compressionByName(defaultCompressionStr) val nowTimeStamp = System.currentTimeMillis() val tableRawName = tableName.getName val familyHFileWriteOptionsMapInternal = new util.HashMap[ByteArrayWrapper, FamilyHFileWriteOptions] val entrySetIt = familyHFileWriteOptionsMap.entrySet().iterator() while (entrySetIt.hasNext) { val entry = entrySetIt.next() familyHFileWriteOptionsMapInternal.put(new ByteArrayWrapper(entry.getKey), entry.getValue) } val regionSplitPartitioner = new BulkLoadPartitioner(startKeys) //This is where all the magic happens //Here we are going to do the following things // 1. FlapMap every row in the RDD into key column value tuples // 2. Then we are going to repartition sort and shuffle // 3. Finally we are going to write out our HFiles rdd.map( r => mapFunction(r)). repartitionAndSortWithinPartitions(regionSplitPartitioner). 
hbaseForeachPartition(this, (it, conn) => { val conf = broadcastedConf.value.value val fs = FileSystem.get(conf) val writerMap = new mutable.HashMap[ByteArrayWrapper, WriterLength] var previousRow:Array[Byte] = HConstants.EMPTY_BYTE_ARRAY var rollOverRequested = false val localTableName = TableName.valueOf(tableRawName) //Here is where we finally iterate through the data in this partition of the //RDD that has been sorted and partitioned it.foreach{ case (rowKey:ByteArrayWrapper, familiesQualifiersValues:FamiliesQualifiersValues) => if (Bytes.compareTo(previousRow, rowKey.value) == 0) { throw new KeyAlreadyExistsException("The following key was sent to the " + "HFile load more then one: " + Bytes.toString(previousRow)) } //The family map is a tree map so the families will be sorted val familyIt = familiesQualifiersValues.familyMap.entrySet().iterator() while (familyIt.hasNext) { val familyEntry = familyIt.next() val family = familyEntry.getKey.value val qualifierIt = familyEntry.getValue.entrySet().iterator() //The qualifier map is a tree map so the families will be sorted while (qualifierIt.hasNext) { val qualifierEntry = qualifierIt.next() val qualifier = qualifierEntry.getKey val cellValue = qualifierEntry.getValue writeValueToHFile(rowKey.value, family, qualifier.value, // qualifier cellValue, // value nowTimeStamp, fs, conn, localTableName, conf, familyHFileWriteOptionsMapInternal, defaultCompression, writerMap, stagingDir) previousRow = rowKey.value } writerMap.values.foreach( wl => { rollOverRequested = rollOverRequested || wl.written > maxSize //This will only roll if we have at least one column family file that is //bigger then maxSize and we have finished a given row key if (rollOverRequested) { rollWriters(fs, writerMap, regionSplitPartitioner, previousRow, compactionExclude) rollOverRequested = false } }) } } //This will get a writer for the column family //If there is no writer for a given column family then //it will get created here. 
//We have finished all the data so lets close up the writers rollWriters(fs, writerMap, regionSplitPartitioner, previousRow, compactionExclude) rollOverRequested = false }) } /** * This will return a new HFile writer when requested * * @param family column family * @param conf configuration to connect to HBase * @param favoredNodes nodes that we would like to write too * @param fs FileSystem object where we will be writing the HFiles to * @return WriterLength object */ private def getNewHFileWriter(family: Array[Byte], conf: Configuration, favoredNodes: Array[InetSocketAddress], fs:FileSystem, familydir:Path, familyHFileWriteOptionsMapInternal: util.HashMap[ByteArrayWrapper, FamilyHFileWriteOptions], defaultCompression:Compression.Algorithm): WriterLength = { var familyOptions = familyHFileWriteOptionsMapInternal.get(new ByteArrayWrapper(family)) if (familyOptions == null) { familyOptions = new FamilyHFileWriteOptions(defaultCompression.toString, BloomType.NONE.toString, HConstants.DEFAULT_BLOCKSIZE, DataBlockEncoding.NONE.toString) familyHFileWriteOptionsMapInternal.put(new ByteArrayWrapper(family), familyOptions) } val tempConf = new Configuration(conf) tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f) val contextBuilder = new HFileContextBuilder() .withCompression(Algorithm.valueOf(familyOptions.compression)) .withChecksumType(HStore.getChecksumType(conf)) .withBytesPerCheckSum(HStore.getBytesPerChecksum(conf)) .withBlockSize(familyOptions.blockSize) if (HFile.getFormatVersion(conf) >= HFile.MIN_FORMAT_VERSION_WITH_TAGS) { contextBuilder.withIncludesTags(true) } contextBuilder.withDataBlockEncoding(DataBlockEncoding. valueOf(familyOptions.dataBlockEncoding)) val hFileContext = contextBuilder.build() //Add a '_' to the file name because this is a unfinished file. A rename will happen // to remove the '_' when the file is closed. 
new WriterLength(0, new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), new HFileSystem(fs)) .withBloomType(BloomType.valueOf(familyOptions.bloomType)) .withComparator(KeyValue.COMPARATOR).withFileContext(hFileContext) .withFilePath(new Path(familydir, "_" + UUID.randomUUID.toString.replaceAll("-", ""))) .withFavoredNodes(favoredNodes).build()) } /** * Encompasses the logic to write a value to an HFile * * @param rowKey The RowKey for the record * @param family HBase column family for the record * @param qualifier HBase column qualifier for the record * @param cellValue HBase cell value * @param nowTimeStamp The cell time stamp * @param fs Connection to the FileSystem for the HFile * @param conn Connection to HBaes * @param tableName HBase TableName object * @param conf Configuration to be used when making a new HFile * @param familyHFileWriteOptionsMapInternal Extra configs for the HFile * @param hfileCompression The compression codec for the new HFile * @param writerMap HashMap of existing writers and their offsets * @param stagingDir The staging directory on the FileSystem to store * the HFiles * @return The writer for the given HFile that was writen * too */ private def writeValueToHFile(rowKey: Array[Byte], family: Array[Byte], qualifier: Array[Byte], cellValue:Array[Byte], nowTimeStamp: Long, fs: FileSystem, conn: Connection, tableName: TableName, conf: Configuration, familyHFileWriteOptionsMapInternal: util.HashMap[ByteArrayWrapper, FamilyHFileWriteOptions], hfileCompression:Compression.Algorithm, writerMap:mutable.HashMap[ByteArrayWrapper, WriterLength], stagingDir: String ): WriterLength = { val wl = writerMap.getOrElseUpdate(new ByteArrayWrapper(family), { val familyDir = new Path(stagingDir, Bytes.toString(family)) fs.mkdirs(familyDir) val loc:HRegionLocation = { try { val locator = conn.getRegionLocator(tableName) locator.getRegionLocation(rowKey) } catch { case e: Throwable => logWarning("there's something wrong when locating rowkey: " + Bytes.toString(rowKey)) null } } if (null == loc) { if (log.isTraceEnabled) { logTrace("failed to get region location, so use default writer: " + Bytes.toString(rowKey)) } getNewHFileWriter(family = family, conf = conf, favoredNodes = null, fs = fs, familydir = familyDir, familyHFileWriteOptionsMapInternal, hfileCompression) } else { if (log.isDebugEnabled) { logDebug("first rowkey: [" + Bytes.toString(rowKey) + "]") } val initialIsa = new InetSocketAddress(loc.getHostname, loc.getPort) if (initialIsa.isUnresolved) { if (log.isTraceEnabled) { logTrace("failed to resolve bind address: " + loc.getHostname + ":" + loc.getPort + ", so use default writer") } getNewHFileWriter(family, conf, null, fs, familyDir, familyHFileWriteOptionsMapInternal, hfileCompression) } else { if(log.isDebugEnabled) { logDebug("use favored nodes writer: " + initialIsa.getHostString) } getNewHFileWriter(family, conf, Array[InetSocketAddress](initialIsa), fs, familyDir, familyHFileWriteOptionsMapInternal, hfileCompression) } } }) val keyValue =new KeyValue(rowKey, family, qualifier, nowTimeStamp,cellValue) wl.writer.append(keyValue) wl.written += keyValue.getLength wl } /** * This will roll all Writers * @param fs Hadoop FileSystem object * @param writerMap HashMap that contains all the writers * @param regionSplitPartitioner The partitioner with knowledge of how the * Region's are split by row key * @param previousRow The last row to fill the HFile ending range metadata * @param compactionExclude The exclude compaction metadata flag for the HFile */ private def 
rollWriters(fs:FileSystem, writerMap:mutable.HashMap[ByteArrayWrapper, WriterLength], regionSplitPartitioner: BulkLoadPartitioner, previousRow: Array[Byte], compactionExclude: Boolean): Unit = { writerMap.values.foreach( wl => { if (wl.writer != null) { logDebug("Writer=" + wl.writer.getPath + (if (wl.written == 0) "" else ", wrote=" + wl.written)) closeHFileWriter(fs, wl.writer, regionSplitPartitioner, previousRow, compactionExclude) } }) writerMap.clear() } /** * Function to close an HFile * @param fs Hadoop FileSystem object * @param w HFile Writer * @param regionSplitPartitioner The partitioner with knowledge of how the * Region's are split by row key * @param previousRow The last row to fill the HFile ending range metadata * @param compactionExclude The exclude compaction metadata flag for the HFile */ private def closeHFileWriter(fs:FileSystem, w: StoreFile.Writer, regionSplitPartitioner: BulkLoadPartitioner, previousRow: Array[Byte], compactionExclude: Boolean): Unit = { if (w != null) { w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY, Bytes.toBytes(System.currentTimeMillis())) w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY, Bytes.toBytes(regionSplitPartitioner.getPartition(previousRow))) w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, Bytes.toBytes(true)) w.appendFileInfo(StoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY, Bytes.toBytes(compactionExclude)) w.appendTrackedTimestampsToMetadata() w.close() val srcPath = w.getPath //In the new path you will see that we are using substring. This is to // remove the '_' character in front of the HFile name. '_' is a character // that will tell HBase that this file shouldn't be included in the bulk load // This feature is to protect for unfinished HFiles being submitted to HBase val newPath = new Path(w.getPath.getParent, w.getPath.getName.substring(1)) if (!fs.rename(srcPath, newPath)) { throw new IOException("Unable to rename '" + srcPath + "' to " + newPath) } } } //single hbase connection close def cleanup(): Unit = { connFactory.cleanupInstance() } /** * This is a wrapper class around StoreFile.Writer. The reason for the * wrapper is to keep the length of the file along side the writer * * @param written The writer to be wrapped * @param writer The number of bytes written to the writer */ class WriterLength(var written:Long, val writer:StoreFile.Writer) } object LatestHBaseContextCache { var latest:HBaseContext = null } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/HBaseDStreamFunctions.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.hadoop.hbase.spark import org.apache.hadoop.hbase.TableName import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.spark.streaming.dstream.DStream import scala.reflect.ClassTag /** * HBaseDStreamFunctions contains a set of implicit functions that can be * applied to a Spark DStream so that we can easily interact with HBase */ object HBaseDStreamFunctions { /** * These are implicit methods for a DStream that contains any type of * data. * * @param dStream This is for dStreams of any type * @tparam T Type T */ implicit class GenericHBaseDStreamFunctions[T](val dStream: DStream[T]) { /** * Implicit method that gives easy access to HBaseContext's bulk * put. This will not return a new Stream. Think of it like a foreach * * @param hc The hbaseContext object to identify which * HBase cluster connection to use * @param tableName The tableName that the put will be sent to * @param f The function that will turn the DStream values * into HBase Put objects. */ def hbaseBulkPut(hc: HBaseContext, tableName: TableName, f: (T) => Put): Unit = { hc.streamBulkPut(dStream, tableName, f) } /** * Implicit method that gives easy access to HBaseContext's bulk * get. This will return a new DStream. Think about it as a DStream map * function. In that every DStream value will get a new value out of * HBase. That new value will populate the newly generated DStream. * * @param hc The hbaseContext object to identify which * HBase cluster connection to use * @param tableName The tableName that the put will be sent to * @param batchSize How many gets to execute in a single batch * @param f The function that will turn the RDD values * in HBase Get objects * @param convertResult The function that will convert a HBase * Result object into a value that will go * into the resulting DStream * @tparam R The type of Object that will be coming * out of the resulting DStream * @return A resulting DStream with type R objects */ def hbaseBulkGet[R: ClassTag](hc: HBaseContext, tableName: TableName, batchSize:Int, f: (T) => Get, convertResult: (Result) => R): DStream[R] = { hc.streamBulkGet[T, R](tableName, batchSize, dStream, f, convertResult) } /** * Implicit method that gives easy access to HBaseContext's bulk * get. This will return a new DStream. Think about it as a DStream map * function. In that every DStream value will get a new value out of * HBase. That new value will populate the newly generated DStream. * * @param hc The hbaseContext object to identify which * HBase cluster connection to use * @param tableName The tableName that the put will be sent to * @param batchSize How many gets to execute in a single batch * @param f The function that will turn the RDD values * in HBase Get objects * @return A resulting DStream with type R objects */ def hbaseBulkGet(hc: HBaseContext, tableName: TableName, batchSize:Int, f: (T) => Get): DStream[(ImmutableBytesWritable, Result)] = { hc.streamBulkGet[T, (ImmutableBytesWritable, Result)]( tableName, batchSize, dStream, f, result => (new ImmutableBytesWritable(result.getRow), result)) } /** * Implicit method that gives easy access to HBaseContext's bulk * Delete. This will not return a new DStream. 
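 *
 * A hedged usage sketch (the table name and the shape of the key stream are
 * assumptions for illustration):
 * {{{
 *   import org.apache.hadoop.hbase.spark.HBaseDStreamFunctions._
 *   // keyStream is an assumed DStream[Array[Byte]] of row keys to delete
 *   keyStream.hbaseBulkDelete(hbaseContext, TableName.valueOf("t1"),
 *     (rowKey: Array[Byte]) => new Delete(rowKey), 100)
 * }}}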
* * @param hc The hbaseContext object to identify which HBase * cluster connection to use * @param tableName The tableName that the deletes will be sent to * @param f The function that will convert the DStream value into * a HBase Delete Object * @param batchSize The number of Deletes to be sent in a single batch */ def hbaseBulkDelete(hc: HBaseContext, tableName: TableName, f:(T) => Delete, batchSize:Int): Unit = { hc.streamBulkDelete(dStream, tableName, f, batchSize) } /** * Implicit method that gives easy access to HBaseContext's * foreachPartition method. This will ack very much like a normal DStream * foreach method but for the fact that you will now have a HBase connection * while iterating through the values. * * @param hc The hbaseContext object to identify which HBase * cluster connection to use * @param f This function will get an iterator for a Partition of an * DStream along with a connection object to HBase */ def hbaseForeachPartition(hc: HBaseContext, f: (Iterator[T], Connection) => Unit): Unit = { hc.streamForeachPartition(dStream, f) } /** * Implicit method that gives easy access to HBaseContext's * mapPartitions method. This will ask very much like a normal DStream * map partitions method but for the fact that you will now have a * HBase connection while iterating through the values * * @param hc The hbaseContext object to identify which HBase * cluster connection to use * @param f This function will get an iterator for a Partition of an * DStream along with a connection object to HBase * @tparam R This is the type of objects that will go into the resulting * DStream * @return A resulting DStream of type R */ def hbaseMapPartitions[R: ClassTag](hc: HBaseContext, f: (Iterator[T], Connection) => Iterator[R]): DStream[R] = { hc.streamMapPartitions(dStream, f) } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/HBaseRDDFunctions.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark import java.util import org.apache.hadoop.hbase.{HConstants, TableName} import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.spark.rdd.RDD import scala.reflect.ClassTag /** * HBaseRDDFunctions contains a set of implicit functions that can be * applied to a Spark RDD so that we can easily interact with HBase */ object HBaseRDDFunctions { /** * These are implicit methods for a RDD that contains any type of * data. * * @param rdd This is for rdd of any type * @tparam T This is any type */ implicit class GenericHBaseRDDFunctions[T](val rdd: RDD[T]) { /** * Implicit method that gives easy access to HBaseContext's bulk * put. 
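 *
 * A hedged usage sketch (the table name, column family and qualifier are
 * assumptions for illustration):
 * {{{
 *   import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
 *   // rdd is an assumed RDD[(String, String)] of (rowKey, value) pairs
 *   rdd.hbaseBulkPut(hbaseContext, TableName.valueOf("t1"),
 *     { case (rowKey, value) =>
 *         new Put(Bytes.toBytes(rowKey))
 *           .addColumn(Bytes.toBytes("f"), Bytes.toBytes("q"), Bytes.toBytes(value)) })
 * }}}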
This will not return a new RDD. Think of it like a foreach * * @param hc The hbaseContext object to identify which * HBase cluster connection to use * @param tableName The tableName that the put will be sent to * @param f The function that will turn the RDD values * into HBase Put objects. */ def hbaseBulkPut(hc: HBaseContext, tableName: TableName, f: (T) => Put): Unit = { hc.bulkPut(rdd, tableName, f) } /** * Implicit method that gives easy access to HBaseContext's bulk * get. This will return a new RDD. Think about it as a RDD map * function. In that every RDD value will get a new value out of * HBase. That new value will populate the newly generated RDD. * * @param hc The hbaseContext object to identify which * HBase cluster connection to use * @param tableName The tableName that the put will be sent to * @param batchSize How many gets to execute in a single batch * @param f The function that will turn the RDD values * in HBase Get objects * @param convertResult The function that will convert a HBase * Result object into a value that will go * into the resulting RDD * @tparam R The type of Object that will be coming * out of the resulting RDD * @return A resulting RDD with type R objects */ def hbaseBulkGet[R: ClassTag](hc: HBaseContext, tableName: TableName, batchSize:Int, f: (T) => Get, convertResult: (Result) => R): RDD[R] = { hc.bulkGet[T, R](tableName, batchSize, rdd, f, convertResult) } /** * Implicit method that gives easy access to HBaseContext's bulk * get. This will return a new RDD. Think about it as a RDD map * function. In that every RDD value will get a new value out of * HBase. That new value will populate the newly generated RDD. * * @param hc The hbaseContext object to identify which * HBase cluster connection to use * @param tableName The tableName that the put will be sent to * @param batchSize How many gets to execute in a single batch * @param f The function that will turn the RDD values * in HBase Get objects * @return A resulting RDD with type R objects */ def hbaseBulkGet(hc: HBaseContext, tableName: TableName, batchSize:Int, f: (T) => Get): RDD[(ImmutableBytesWritable, Result)] = { hc.bulkGet[T, (ImmutableBytesWritable, Result)](tableName, batchSize, rdd, f, result => if (result != null && result.getRow != null) { (new ImmutableBytesWritable(result.getRow), result) } else { null }) } /** * Implicit method that gives easy access to HBaseContext's bulk * Delete. This will not return a new RDD. * * @param hc The hbaseContext object to identify which HBase * cluster connection to use * @param tableName The tableName that the deletes will be sent to * @param f The function that will convert the RDD value into * a HBase Delete Object * @param batchSize The number of Deletes to be sent in a single batch */ def hbaseBulkDelete(hc: HBaseContext, tableName: TableName, f:(T) => Delete, batchSize:Int): Unit = { hc.bulkDelete(rdd, tableName, f, batchSize) } /** * Implicit method that gives easy access to HBaseContext's * foreachPartition method. This will ack very much like a normal RDD * foreach method but for the fact that you will now have a HBase connection * while iterating through the values. 
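 *
 * A hedged usage sketch (table name, column layout and record shape are
 * assumptions for illustration; the connection is managed by the HBaseContext
 * and should not be closed here):
 * {{{
 *   import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
 *   rdd.hbaseForeachPartition(hbaseContext, (it, connection) => {
 *     val table = connection.getTable(TableName.valueOf("t1"))
 *     val puts = new java.util.ArrayList[Put]()
 *     it.foreach(r => puts.add(
 *       new Put(Bytes.toBytes(r.toString))
 *         .addColumn(Bytes.toBytes("f"), Bytes.toBytes("q"), Bytes.toBytes(r.toString))))
 *     if (!puts.isEmpty) table.put(puts)
 *     table.close()
 *   })
 * }}}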
* * @param hc The hbaseContext object to identify which HBase * cluster connection to use * @param f This function will get an iterator for a Partition of an * RDD along with a connection object to HBase */ def hbaseForeachPartition(hc: HBaseContext, f: (Iterator[T], Connection) => Unit): Unit = { hc.foreachPartition(rdd, f) } /** * Implicit method that gives easy access to HBaseContext's * mapPartitions method. This will ask very much like a normal RDD * map partitions method but for the fact that you will now have a * HBase connection while iterating through the values * * @param hc The hbaseContext object to identify which HBase * cluster connection to use * @param f This function will get an iterator for a Partition of an * RDD along with a connection object to HBase * @tparam R This is the type of objects that will go into the resulting * RDD * @return A resulting RDD of type R */ def hbaseMapPartitions[R: ClassTag](hc: HBaseContext, f: (Iterator[T], Connection) => Iterator[R]): RDD[R] = { hc.mapPartitions[T,R](rdd, f) } /** * Spark Implementation of HBase Bulk load for wide rows or when * values are not already combined at the time of the map process * * A Spark Implementation of HBase Bulk load * * This will take the content from an existing RDD then sort and shuffle * it with respect to region splits. The result of that sort and shuffle * will be written to HFiles. * * After this function is executed the user will have to call * LoadIncrementalHFiles.doBulkLoad(...) to move the files into HBase * * Also note this version of bulk load is different from past versions in * that it includes the qualifier as part of the sort process. The * reason for this is to be able to support rows will very large number * of columns. * * @param tableName The HBase table we are loading into * @param flatMap A flapMap function that will make every row in the RDD * into N cells for the bulk load * @param stagingDir The location on the FileSystem to bulk load into * @param familyHFileWriteOptionsMap Options that will define how the HFile for a * column family is written * @param compactionExclude Compaction excluded for the HFiles * @param maxSize Max size for the HFiles before they roll */ def hbaseBulkLoad(hc: HBaseContext, tableName: TableName, flatMap: (T) => Iterator[(KeyFamilyQualifier, Array[Byte])], stagingDir:String, familyHFileWriteOptionsMap: util.Map[Array[Byte], FamilyHFileWriteOptions] = new util.HashMap[Array[Byte], FamilyHFileWriteOptions](), compactionExclude: Boolean = false, maxSize:Long = HConstants.DEFAULT_MAX_FILE_SIZE):Unit = { hc.bulkLoad(rdd, tableName, flatMap, stagingDir, familyHFileWriteOptionsMap, compactionExclude, maxSize) } /** * Implicit method that gives easy access to HBaseContext's * bulkLoadThinRows method. * * Spark Implementation of HBase Bulk load for short rows some where less then * a 1000 columns. This bulk load should be faster for tables will thinner * rows then the other spark implementation of bulk load that puts only one * value into a record going into a shuffle * * This will take the content from an existing RDD then sort and shuffle * it with respect to region splits. The result of that sort and shuffle * will be written to HFiles. * * After this function is executed the user will have to call * LoadIncrementalHFiles.doBulkLoad(...) to move the files into HBase * * In this implementation only the rowKey is given to the shuffle as the key * and all the columns are already linked to the RowKey before the shuffle * stage. 
The sorting of the qualifier is done in memory out side of the * shuffle stage * * @param tableName The HBase table we are loading into * @param mapFunction A function that will convert the RDD records to * the key value format used for the shuffle to prep * for writing to the bulk loaded HFiles * @param stagingDir The location on the FileSystem to bulk load into * @param familyHFileWriteOptionsMap Options that will define how the HFile for a * column family is written * @param compactionExclude Compaction excluded for the HFiles * @param maxSize Max size for the HFiles before they roll */ def hbaseBulkLoadThinRows(hc: HBaseContext, tableName: TableName, mapFunction: (T) => (ByteArrayWrapper, FamiliesQualifiersValues), stagingDir:String, familyHFileWriteOptionsMap: util.Map[Array[Byte], FamilyHFileWriteOptions] = new util.HashMap[Array[Byte], FamilyHFileWriteOptions](), compactionExclude: Boolean = false, maxSize:Long = HConstants.DEFAULT_MAX_FILE_SIZE):Unit = { hc.bulkLoadThinRows(rdd, tableName, mapFunction, stagingDir, familyHFileWriteOptionsMap, compactionExclude, maxSize) } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/JavaHBaseContext.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark import org.apache.hadoop.hbase.TableName import org.apache.spark.api.java.JavaSparkContext import org.apache.hadoop.conf.Configuration import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.function.VoidFunction import org.apache.spark.api.java.function.Function import org.apache.hadoop.hbase.client.Connection import org.apache.spark.streaming.api.java.JavaDStream import org.apache.spark.api.java.function.FlatMapFunction import scala.collection.JavaConversions._ import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.client.Delete import org.apache.hadoop.hbase.client.Get import org.apache.hadoop.hbase.client.Result import org.apache.hadoop.hbase.client.Scan import org.apache.hadoop.hbase.io.ImmutableBytesWritable import scala.reflect.ClassTag /** * This is the Java Wrapper over HBaseContext which is written in * Scala. This class will be used by developers that want to * work with Spark or Spark Streaming in Java * * @param jsc This is the JavaSparkContext that we will wrap * @param config This is the config information to out HBase cluster */ class JavaHBaseContext(@transient jsc: JavaSparkContext, @transient config: Configuration) extends Serializable { val hbaseContext = new HBaseContext(jsc.sc, config) /** * A simple enrichment of the traditional Spark javaRdd foreachPartition. 
* This function differs from the original in that it offers the * developer access to a already connected HConnection object * * Note: Do not close the HConnection object. All HConnection * management is handled outside this method * * @param javaRdd Original javaRdd with data to iterate over * @param f Function to be given a iterator to iterate through * the RDD values and a HConnection object to interact * with HBase */ def foreachPartition[T](javaRdd: JavaRDD[T], f: VoidFunction[(java.util.Iterator[T], Connection)] ) = { hbaseContext.foreachPartition(javaRdd.rdd, (it:Iterator[T], conn:Connection) => { f.call((it, conn)) }) } /** * A simple enrichment of the traditional Spark Streaming dStream foreach * This function differs from the original in that it offers the * developer access to a already connected HConnection object * * Note: Do not close the HConnection object. All HConnection * management is handled outside this method * * @param javaDstream Original DStream with data to iterate over * @param f Function to be given a iterator to iterate through * the JavaDStream values and a HConnection object to * interact with HBase */ def foreachPartition[T](javaDstream: JavaDStream[T], f: VoidFunction[(Iterator[T], Connection)]) = { hbaseContext.foreachPartition(javaDstream.dstream, (it:Iterator[T], conn: Connection) => f.call(it, conn)) } /** * A simple enrichment of the traditional Spark JavaRDD mapPartition. * This function differs from the original in that it offers the * developer access to a already connected HConnection object * * Note: Do not close the HConnection object. All HConnection * management is handled outside this method * * Note: Make sure to partition correctly to avoid memory issue when * getting data from HBase * * @param javaRdd Original JavaRdd with data to iterate over * @param f Function to be given a iterator to iterate through * the RDD values and a HConnection object to interact * with HBase * @return Returns a new RDD generated by the user definition * function just like normal mapPartition */ def mapPartitions[T,R](javaRdd: JavaRDD[T], f: FlatMapFunction[(java.util.Iterator[T], Connection),R] ): JavaRDD[R] = { def fn = (it: Iterator[T], conn: Connection) => asScalaIterator( f.call((asJavaIterator(it), conn)) ) JavaRDD.fromRDD(hbaseContext.mapPartitions(javaRdd.rdd, (iterator:Iterator[T], connection:Connection) => fn(iterator, connection))(fakeClassTag[R]))(fakeClassTag[R]) } /** * A simple enrichment of the traditional Spark Streaming JavaDStream * mapPartition. * * This function differs from the original in that it offers the * developer access to a already connected HConnection object * * Note: Do not close the HConnection object. 
All HConnection * management is handled outside this method * * Note: Make sure to partition correctly to avoid memory issue when * getting data from HBase * * @param javaDstream Original JavaDStream with data to iterate over * @param mp Function to be given a iterator to iterate through * the JavaDStream values and a HConnection object to * interact with HBase * @return Returns a new JavaDStream generated by the user * definition function just like normal mapPartition */ def streamMap[T, U](javaDstream: JavaDStream[T], mp: Function[(Iterator[T], Connection), Iterator[U]]): JavaDStream[U] = { JavaDStream.fromDStream(hbaseContext.streamMapPartitions(javaDstream.dstream, (it: Iterator[T], conn: Connection) => mp.call(it, conn) )(fakeClassTag[U]))(fakeClassTag[U]) } /** * A simple abstraction over the HBaseContext.foreachPartition method. * * It allow addition support for a user to take JavaRDD * and generate puts and send them to HBase. * The complexity of managing the HConnection is * removed from the developer * * @param javaDdd Original JavaRDD with data to iterate over * @param tableName The name of the table to put into * @param f Function to convert a value in the JavaRDD * to a HBase Put */ def bulkPut[T](javaDdd: JavaRDD[T], tableName: TableName, f: Function[(T), Put]) { hbaseContext.bulkPut(javaDdd.rdd, tableName, (t:T) => f.call(t)) } /** * A simple abstraction over the HBaseContext.streamMapPartition method. * * It allow addition support for a user to take a JavaDStream and * generate puts and send them to HBase. * * The complexity of managing the HConnection is * removed from the developer * * @param javaDstream Original DStream with data to iterate over * @param tableName The name of the table to put into * @param f Function to convert a value in * the JavaDStream to a HBase Put */ def streamBulkPut[T](javaDstream: JavaDStream[T], tableName: TableName, f: Function[T,Put]) = { hbaseContext.streamBulkPut(javaDstream.dstream, tableName, (t:T) => f.call(t)) } /** * A simple abstraction over the HBaseContext.foreachPartition method. * * It allow addition support for a user to take a JavaRDD and * generate delete and send them to HBase. * * The complexity of managing the HConnection is * removed from the developer * * @param javaRdd Original JavaRDD with data to iterate over * @param tableName The name of the table to delete from * @param f Function to convert a value in the JavaRDD to a * HBase Deletes * @param batchSize The number of deletes to batch before sending to HBase */ def bulkDelete[T](javaRdd: JavaRDD[T], tableName: TableName, f: Function[T, Delete], batchSize:Integer) { hbaseContext.bulkDelete(javaRdd.rdd, tableName, (t:T) => f.call(t), batchSize) } /** * A simple abstraction over the HBaseContext.streamBulkMutation method. * * It allow addition support for a user to take a JavaDStream and * generate Delete and send them to HBase. * * The complexity of managing the HConnection is * removed from the developer * * @param javaDstream Original DStream with data to iterate over * @param tableName The name of the table to delete from * @param f Function to convert a value in the JavaDStream to a * HBase Delete * @param batchSize The number of deletes to be sent at once */ def streamBulkDelete[T](javaDstream: JavaDStream[T], tableName: TableName, f: Function[T, Delete], batchSize: Integer) = { hbaseContext.streamBulkDelete(javaDstream.dstream, tableName, (t:T) => f.call(t), batchSize) } /** * A simple abstraction over the HBaseContext.mapPartition method. 
* * It allow addition support for a user to take a JavaRDD and generates a * new RDD based on Gets and the results they bring back from HBase * * @param tableName The name of the table to get from * @param batchSize batch size of how many gets to retrieve in a single fetch * @param javaRdd Original JavaRDD with data to iterate over * @param makeGet Function to convert a value in the JavaRDD to a * HBase Get * @param convertResult This will convert the HBase Result object to * what ever the user wants to put in the resulting * JavaRDD * return new JavaRDD that is created by the Get to HBase */ def bulkGet[T, U](tableName: TableName, batchSize:Integer, javaRdd: JavaRDD[T], makeGet: Function[T, Get], convertResult: Function[Result, U]): JavaRDD[U] = { JavaRDD.fromRDD(hbaseContext.bulkGet[T, U](tableName, batchSize, javaRdd.rdd, (t:T) => makeGet.call(t), (r:Result) => {convertResult.call(r)})(fakeClassTag[U]))(fakeClassTag[U]) } /** * A simple abstraction over the HBaseContext.streamMap method. * * It allow addition support for a user to take a DStream and * generates a new DStream based on Gets and the results * they bring back from HBase * * @param tableName The name of the table to get from * @param batchSize The number of gets to be batched together * @param javaDStream Original DStream with data to iterate over * @param makeGet Function to convert a value in the JavaDStream to a * HBase Get * @param convertResult This will convert the HBase Result object to * what ever the user wants to put in the resulting * JavaDStream * return new JavaDStream that is created by the Get to HBase */ def streamBulkGet[T, U](tableName:TableName, batchSize:Integer, javaDStream: JavaDStream[T], makeGet: Function[T, Get], convertResult: Function[Result, U]) { JavaDStream.fromDStream(hbaseContext.streamBulkGet(tableName, batchSize, javaDStream.dstream, (t:T) => makeGet.call(t), (r:Result) => convertResult.call(r) )(fakeClassTag[U]))(fakeClassTag[U]) } /** * This function will use the native HBase TableInputFormat with the * given scan object to generate a new JavaRDD * * @param tableName the name of the table to scan * @param scans the HBase scan object to use to read data from HBase * @param f function to convert a Result object from HBase into * what the user wants in the final generated JavaRDD * @return new JavaRDD with results from scan */ def hbaseRDD[U](tableName: TableName, scans: Scan, f: Function[(ImmutableBytesWritable, Result), U]): JavaRDD[U] = { JavaRDD.fromRDD( hbaseContext.hbaseRDD[U](tableName, scans, (v:(ImmutableBytesWritable, Result)) => f.call(v._1, v._2))(fakeClassTag[U]))(fakeClassTag[U]) } /** * A overloaded version of HBaseContext hbaseRDD that define the * type of the resulting JavaRDD * * @param tableName the name of the table to scan * @param scans the HBase scan object to use to read data from HBase * @return New JavaRDD with results from scan * */ def hbaseRDD(tableName: TableName, scans: Scan): JavaRDD[(ImmutableBytesWritable, Result)] = { JavaRDD.fromRDD(hbaseContext.hbaseRDD(tableName, scans)) } /** * Produces a ClassTag[T], which is actually just a casted ClassTag[AnyRef]. * * This method is used to keep ClassTags out of the external Java API, as the Java compiler * cannot produce them automatically. While this ClassTag-faking does please the compiler, * it can cause problems at runtime if the Scala API relies on ClassTags for correctness. * * Often, though, a ClassTag[AnyRef] will not lead to incorrect behavior, * just worse performance or security issues. 
* For instance, an Array[AnyRef] can hold any type T, * but may lose primitive * specialization. */ private[spark] def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/KeyFamilyQualifier.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark import java.io.Serializable import org.apache.hadoop.hbase.util.Bytes /** * This is the key to be used for sorting and shuffling. * * We will only partition on the rowKey but we will sort on all three * * @param rowKey Record RowKey * @param family Record ColumnFamily * @param qualifier Cell Qualifier */ class KeyFamilyQualifier(val rowKey:Array[Byte], val family:Array[Byte], val qualifier:Array[Byte]) extends Comparable[KeyFamilyQualifier] with Serializable { override def compareTo(o: KeyFamilyQualifier): Int = { var result = Bytes.compareTo(rowKey, o.rowKey) if (result == 0) { result = Bytes.compareTo(family, o.family) if (result == 0) result = Bytes.compareTo(qualifier, o.qualifier) } result } override def toString: String = { Bytes.toString(rowKey) + ":" + Bytes.toString(family) + ":" + Bytes.toString(qualifier) } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/NewHBaseRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.hadoop.hbase.spark import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapreduce.InputFormat import org.apache.spark.rdd.NewHadoopRDD import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} class NewHBaseRDD[K,V](@transient sc : SparkContext, @transient inputFormatClass: Class[_ <: InputFormat[K, V]], @transient keyClass: Class[K], @transient valueClass: Class[V], @transient conf: Configuration, val hBaseContext: HBaseContext) extends NewHadoopRDD(sc,inputFormatClass, keyClass, valueClass, conf) { override def compute(theSplit: Partition, context: TaskContext): InterruptibleIterator[(K, V)] = { super.compute(theSplit, context) } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/datasources/Bound.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.datasources import org.apache.hadoop.hbase.spark.hbase._ /** * The Bound represent the boudary for the scan * * @param b The byte array of the bound * @param inc inclusive or not. */ case class Bound(b: Array[Byte], inc: Boolean) // The non-overlapping ranges we need to scan, if lower is equal to upper, it is a get request case class Range(lower: Option[Bound], upper: Option[Bound]) object Range { def apply(region: HBaseRegion): Range = { Range(region.start.map(Bound(_, true)), if (region.end.get.size == 0) { None } else { region.end.map((Bound(_, false))) }) } } object Ranges { // We assume that // 1. r.lower.inc is true, and r.upper.inc is false // 2. for each range in rs, its upper.inc is false def and(r: Range, rs: Seq[Range]): Seq[Range] = { rs.flatMap{ s => val lower = s.lower.map { x => // the scan has lower bound r.lower.map { y => // the region has lower bound if (ord.compare(x.b, y.b) < 0) { // scan lower bound is smaller than region server lower bound Some(y) } else { // scan low bound is greater or equal to region server lower bound Some(x) } }.getOrElse(Some(x)) }.getOrElse(r.lower) val upper = s.upper.map { x => // the scan has upper bound r.upper.map { y => // the region has upper bound if (ord.compare(x.b, y.b) >= 0) { // scan upper bound is larger than server upper bound // but region server scan stop is exclusive. It is OK here. 
Some(y) } else { // scan upper bound is less or equal to region server upper bound Some(x) } }.getOrElse(Some(x)) }.getOrElse(r.upper) val c = lower.map { case x => upper.map { case y => ord.compare(x.b, y.b) }.getOrElse(-1) }.getOrElse(-1) if (c < 0) { Some(Range(lower, upper)) } else { None } }.seq } } object Points { def and(r: Range, ps: Seq[Array[Byte]]): Seq[Array[Byte]] = { ps.flatMap { p => if (ord.compare(r.lower.get.b, p) <= 0) { // if region lower bound is less or equal to the point if (r.upper.isDefined) { // if region upper bound is defined if (ord.compare(r.upper.get.b, p) > 0) { // if the upper bound is greater than the point (because upper bound is exclusive) Some(p) } else { None } } else { // if the region upper bound is not defined (infinity) Some(p) } } else { None } } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/datasources/HBaseResources.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.datasources import org.apache.hadoop.hbase.TableName import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.spark.HBaseRelation import scala.language.implicitConversions // Resource and ReferencedResources are defined for extensibility, // e.g., consolidate scan and bulkGet in the future work. 
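// As a hedged illustration of the intended usage pattern (the relation value
// below is an assumed HBaseRelation), a TableResource defined further down is
// typically driven like:
//   val tbr = TableResource(relation)
//   val scanRes = tbr.getScanner(new Scan())   // acquires the underlying table
//   try { val first = scanRes.rs.next() } finally { scanRes.release() }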
// User has to invoke release explicitly to release the resource, // and potentially parent resources trait Resource { def release(): Unit } case class ScanResource(tbr: TableResource, rs: ResultScanner) extends Resource { def release() { rs.close() tbr.release() } } case class GetResource(tbr: TableResource, rs: Array[Result]) extends Resource { def release() { tbr.release() } } trait ReferencedResource { var count: Int = 0 def init(): Unit def destroy(): Unit def acquire() = synchronized { try { count += 1 if (count == 1) { init() } } catch { case e: Throwable => release() throw e } } def release() = synchronized { count -= 1 if (count == 0) { destroy() } } def releaseOnException[T](func: => T): T = { acquire() val ret = { try { func } catch { case e: Throwable => release() throw e } } ret } } case class TableResource(relation: HBaseRelation) extends ReferencedResource { var connection: Connection = _ var table: Table = _ override def init(): Unit = { connection = ConnectionFactory.createConnection(relation.hbaseConf) table = connection.getTable(TableName.valueOf(relation.tableName)) } override def destroy(): Unit = { if (table != null) { table.close() table = null } if (connection != null) { connection.close() connection = null } } def getScanner(scan: Scan): ScanResource = releaseOnException { ScanResource(this, table.getScanner(scan)) } def get(list: java.util.List[org.apache.hadoop.hbase.client.Get]) = releaseOnException { GetResource(this, table.get(list)) } } case class RegionResource(relation: HBaseRelation) extends ReferencedResource { var connection: Connection = _ var rl: RegionLocator = _ val regions = releaseOnException { val keys = rl.getStartEndKeys keys.getFirst.zip(keys.getSecond) .zipWithIndex .map(x => HBaseRegion(x._2, Some(x._1._1), Some(x._1._2), Some(rl.getRegionLocation(x._1._1).getHostname))) } override def init(): Unit = { connection = ConnectionFactory.createConnection(relation.hbaseConf) rl = connection.getRegionLocator(TableName.valueOf(relation.tableName)) } override def destroy(): Unit = { if (rl != null) { rl.close() rl = null } if (connection != null) { connection.close() connection = null } } } object HBaseResources{ implicit def ScanResToScan(sr: ScanResource): ResultScanner = { sr.rs } implicit def GetResToResult(gr: GetResource): Array[Result] = { gr.rs } implicit def TableResToTable(tr: TableResource): Table = { tr.table } implicit def RegionResToRegions(rr: RegionResource): Seq[HBaseRegion] = { rr.regions } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/datasources/HBaseSparkConf.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
 */
package org.apache.hadoop.hbase.spark.datasources

object HBaseSparkConf {
  // This is the hbase configuration. Users can either set them in SparkConf, which
  // will take effect globally, or configure them per table, which will override the value
  // set in SparkConf. If not set, the default value will take effect.
  val BLOCK_CACHE_ENABLE = "spark.hbase.blockcache.enable"
  // The block cache is enabled by default, following the HBase convention, but note that
  // this potentially may slow down the system
  val defaultBlockCacheEnable = true
  val CACHE_SIZE = "spark.hbase.cacheSize"
  val defaultCachingSize = 1000
  val BATCH_NUM = "spark.hbase.batchNum"
  val defaultBatchNum = 1000
  val BULKGET_SIZE = "spark.hbase.bulkGetSize"
  val defaultBulkGetSize = 1000
}

================================================
FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/datasources/SerializableConfiguration.scala
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.spark.datasources

import java.io.{IOException, ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.conf.Configuration

import scala.util.control.NonFatal

class SerializableConfiguration(@transient var value: Configuration) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = tryOrIOException {
    value = new Configuration(false)
    value.readFields(in)
  }

  def tryOrIOException(block: => Unit) {
    try {
      block
    } catch {
      case e: IOException => throw e
      case NonFatal(t) => throw new IOException(t)
    }
  }
}

================================================
FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/datasources/package.scala
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
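// A minimal sketch of overriding the HBaseSparkConf defaults above globally through SparkConf;
// the same keys can also be supplied as per-table options, which take precedence over SparkConf
// (the values here are illustrative):
import org.apache.spark.SparkConf
import org.apache.hadoop.hbase.spark.datasources.HBaseSparkConf

val sparkConf = new SparkConf()
  .setAppName("hbase-scan")
  .set(HBaseSparkConf.BLOCK_CACHE_ENABLE, "false")  // bypass the region-server block cache for full scans
  .set(HBaseSparkConf.CACHE_SIZE, "500")            // rows fetched per scanner RPC (default 1000)
  .set(HBaseSparkConf.BATCH_NUM, "500")             // columns fetched per batch (default 1000)
  .set(HBaseSparkConf.BULKGET_SIZE, "2000")         // keys per multi-get batch (default 1000)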
*/ package org.apache.hadoop.hbase.spark import org.apache.hadoop.hbase.util.Bytes import scala.math.Ordering package object hbase { type HBaseType = Array[Byte] val ByteMax = -1.asInstanceOf[Byte] val ByteMin = 0.asInstanceOf[Byte] val ord: Ordering[HBaseType] = new Ordering[HBaseType] { def compare(x: Array[Byte], y: Array[Byte]): Int = { return Bytes.compareTo(x, y) } } //Do not use BinaryType.ordering implicit val order: Ordering[HBaseType] = ord } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkDeleteExample.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.spark.SparkContext import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.client.Delete import org.apache.spark.SparkConf /** * This is a simple example of deleting records in HBase * with the bulkDelete function. */ object HBaseBulkDeleteExample { def main(args: Array[String]) { if (args.length < 1) { println("HBaseBulkDeletesExample {tableName} ") return } val tableName = args(0) val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName) val sc = new SparkContext(sparkConf) try { //[Array[Byte]] val rdd = sc.parallelize(Array( Bytes.toBytes("1"), Bytes.toBytes("2"), Bytes.toBytes("3"), Bytes.toBytes("4"), Bytes.toBytes("5") )) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) hbaseContext.bulkDelete[Array[Byte]](rdd, TableName.valueOf(tableName), putRecord => new Delete(putRecord), 4) } finally { sc.stop() } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkGetExample.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
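// The package object above pins row-key comparison to HBase's unsigned, byte-wise ordering
// (Bytes.compareTo) instead of relying on a default Ordering. A small sketch with toy keys:
import org.apache.hadoop.hbase.spark.hbase._
import org.apache.hadoop.hbase.util.Bytes

val keys: Seq[HBaseType] = Seq("row2", "row10", "row1").map(Bytes.toBytes)
// Picks up the implicit `order`: prefixes sort first, then unsigned byte value.
keys.sorted.map(Bytes.toString)   // Seq("row1", "row10", "row2")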
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.spark.SparkContext import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseConfiguration} import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.client.Get import org.apache.hadoop.hbase.client.Result import org.apache.spark.SparkConf /** * This is a simple example of getting records in HBase * with the bulkGet function. */ object HBaseBulkGetExample { def main(args: Array[String]) { if (args.length < 1) { println("HBaseBulkGetExample {tableName}") return } val tableName = args(0) val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName) val sc = new SparkContext(sparkConf) try { //[(Array[Byte])] val rdd = sc.parallelize(Array( Bytes.toBytes("1"), Bytes.toBytes("2"), Bytes.toBytes("3"), Bytes.toBytes("4"), Bytes.toBytes("5"), Bytes.toBytes("6"), Bytes.toBytes("7"))) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) val getRdd = hbaseContext.bulkGet[Array[Byte], String]( TableName.valueOf(tableName), 2, rdd, record => { System.out.println("making Get") new Get(record) }, (result: Result) => { val it = result.listCells().iterator() val b = new StringBuilder b.append(Bytes.toString(result.getRow) + ":") while (it.hasNext) { val cell = it.next() val q = Bytes.toString(CellUtil.cloneQualifier(cell)) if (q.equals("counter")) { b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")") } else { b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")") } } b.toString() }) getRdd.collect().foreach(v => println(v)) } finally { sc.stop() } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkPutExample.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.spark.SparkContext import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.client.Put import org.apache.spark.SparkConf /** * This is a simple example of putting records in HBase * with the bulkPut function. 
*/ object HBaseBulkPutExample { def main(args: Array[String]) { if (args.length < 2) { println("HBaseBulkPutExample {tableName} {columnFamily}") return } val tableName = args(0) val columnFamily = args(1) val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " + tableName + " " + columnFamily) val sc = new SparkContext(sparkConf) try { //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])] val rdd = sc.parallelize(Array( (Bytes.toBytes("1"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))), (Bytes.toBytes("2"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))), (Bytes.toBytes("3"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))), (Bytes.toBytes("4"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))), (Bytes.toBytes("5"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5")))) )) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd, TableName.valueOf(tableName), (putRecord) => { val put = new Put(putRecord._1) putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3)) put }); } finally { sc.stop() } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkPutExampleFromFile.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.spark.SparkContext import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.mapred.TextInputFormat import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.Text import org.apache.spark.SparkConf /** * This is a simple example of putting records in HBase * with the bulkPut function. 
In this example we are * getting the put information from a file */ object HBaseBulkPutExampleFromFile { def main(args: Array[String]) { if (args.length < 3) { println("HBaseBulkPutExampleFromFile {tableName} {columnFamily} {inputFile}") return } val tableName = args(0) val columnFamily = args(1) val inputFile = args(2) val sparkConf = new SparkConf().setAppName("HBaseBulkPutExampleFromFile " + tableName + " " + columnFamily + " " + inputFile) val sc = new SparkContext(sparkConf) try { var rdd = sc.hadoopFile( inputFile, classOf[TextInputFormat], classOf[LongWritable], classOf[Text]).map(v => { System.out.println("reading-" + v._2.toString) v._2.toString }) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) hbaseContext.bulkPut[String](rdd, TableName.valueOf(tableName), (putRecord) => { System.out.println("hbase-" + putRecord) val put = new Put(Bytes.toBytes("Value- " + putRecord)) put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("1"), Bytes.toBytes(putRecord.length())) put }); } finally { sc.stop() } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseBulkPutTimestampExample.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.spark.SparkContext import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.client.Put import org.apache.spark.SparkConf /** * This is a simple example of putting records in HBase * with the bulkPut function. 
In this example we are * also setting the timestamp in the put */ object HBaseBulkPutTimestampExample { def main(args: Array[String]) { if (args.length < 2) { System.out.println("HBaseBulkPutTimestampExample {tableName} {columnFamily}") return } val tableName = args(0) val columnFamily = args(1) val sparkConf = new SparkConf().setAppName("HBaseBulkPutTimestampExample " + tableName + " " + columnFamily) val sc = new SparkContext(sparkConf) try { val rdd = sc.parallelize(Array( (Bytes.toBytes("6"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))), (Bytes.toBytes("7"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))), (Bytes.toBytes("8"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))), (Bytes.toBytes("9"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))), (Bytes.toBytes("10"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5")))))) val conf = HBaseConfiguration.create() val timeStamp = System.currentTimeMillis() val hbaseContext = new HBaseContext(sc, conf) hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd, TableName.valueOf(tableName), (putRecord) => { val put = new Put(putRecord._1) putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, timeStamp, putValue._3)) put }) } finally { sc.stop() } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseDistributedScanExample.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.spark.SparkContext import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.client.Scan import org.apache.spark.SparkConf /** * This is a simple example of scanning records from HBase * with the hbaseRDD function. 
*/ object HBaseDistributedScanExample { def main(args: Array[String]) { if (args.length < 1) { println("GenerateGraphs {tableName}") return } val tableName = args(0) val sparkConf = new SparkConf().setAppName("HBaseDistributedScanExample " + tableName ) val sc = new SparkContext(sparkConf) try { val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) val scan = new Scan() scan.setCaching(100) val getRdd = hbaseContext.hbaseRDD(TableName.valueOf(tableName), scan) getRdd.foreach(v => println(Bytes.toString(v._1.get()))) println("Length: " + getRdd.map(r => r._1.copyBytes()).collect().length); //.collect().foreach(v => println(Bytes.toString(v._1.get()))) } finally { sc.stop() } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/example/hbasecontext/HBaseStreamingBulkPutExample.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.example.hbasecontext import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.spark.SparkContext import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.client.Put import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.Seconds import org.apache.spark.SparkConf /** * This is a simple example of BulkPut with Spark Streaming */ object HBaseStreamingBulkPutExample { def main(args: Array[String]) { if (args.length < 4) { println("HBaseStreamingBulkPutExample " + "{host} {port} {tableName} {columnFamily}") return } val host = args(0) val port = args(1) val tableName = args(2) val columnFamily = args(3) val sparkConf = new SparkConf().setAppName("HBaseBulkPutTimestampExample " + tableName + " " + columnFamily) val sc = new SparkContext(sparkConf) try { val ssc = new StreamingContext(sc, Seconds(1)) val lines = ssc.socketTextStream(host, port.toInt) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) hbaseContext.streamBulkPut[String](lines, TableName.valueOf(tableName), (putRecord) => { if (putRecord.length() > 0) { val put = new Put(Bytes.toBytes(putRecord)) put.addColumn(Bytes.toBytes("c"), Bytes.toBytes("foo"), Bytes.toBytes("bar")) put } else { null } }) ssc.start() ssc.awaitTerminationOrTimeout(60000) } finally { sc.stop() } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseBulkDeleteExample.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.example.rdd import org.apache.hadoop.hbase.client.Delete import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.{SparkContext, SparkConf} /** * This is a simple example of deleting records in HBase * with the bulkDelete function. */ object HBaseBulkDeleteExample { def main(args: Array[String]) { if (args.length < 1) { println("HBaseBulkDeletesExample {tableName} ") return } val tableName = args(0) val sparkConf = new SparkConf().setAppName("HBaseBulkDeleteExample " + tableName) val sc = new SparkContext(sparkConf) try { //[Array[Byte]] val rdd = sc.parallelize(Array( Bytes.toBytes("1"), Bytes.toBytes("2"), Bytes.toBytes("3"), Bytes.toBytes("4"), Bytes.toBytes("5") )) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) rdd.hbaseBulkDelete(hbaseContext, TableName.valueOf(tableName), putRecord => new Delete(putRecord), 4) } finally { sc.stop() } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseBulkGetExample.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.example.rdd import org.apache.hadoop.hbase.client.{Result, Get} import org.apache.hadoop.hbase.{CellUtil, TableName, HBaseConfiguration} import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ import org.apache.spark.{SparkContext, SparkConf} /** * This is a simple example of getting records in HBase * with the bulkGet function. 
*/ object HBaseBulkGetExample { def main(args: Array[String]) { if (args.length < 1) { println("HBaseBulkGetExample {tableName}") return } val tableName = args(0) val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName) val sc = new SparkContext(sparkConf) try { //[(Array[Byte])] val rdd = sc.parallelize(Array( Bytes.toBytes("1"), Bytes.toBytes("2"), Bytes.toBytes("3"), Bytes.toBytes("4"), Bytes.toBytes("5"), Bytes.toBytes("6"), Bytes.toBytes("7"))) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) val getRdd = rdd.hbaseBulkGet[String](hbaseContext, TableName.valueOf(tableName), 2, record => { System.out.println("making Get") new Get(record) }, (result: Result) => { val it = result.listCells().iterator() val b = new StringBuilder b.append(Bytes.toString(result.getRow) + ":") while (it.hasNext) { val cell = it.next() val q = Bytes.toString(CellUtil.cloneQualifier(cell)) if (q.equals("counter")) { b.append("(" + q + "," + Bytes.toLong(CellUtil.cloneValue(cell)) + ")") } else { b.append("(" + q + "," + Bytes.toString(CellUtil.cloneValue(cell)) + ")") } } b.toString() }) getRdd.collect().foreach(v => println(v)) } finally { sc.stop() } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseBulkPutExample.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.example.rdd import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.{HBaseConfiguration, TableName} import org.apache.spark.{SparkConf, SparkContext} /** * This is a simple example of putting records in HBase * with the bulkPut function. 
*/ object HBaseBulkPutExample { def main(args: Array[String]) { if (args.length < 2) { println("HBaseBulkPutExample {tableName} {columnFamily}") return } val tableName = args(0) val columnFamily = args(1) val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " + tableName + " " + columnFamily) val sc = new SparkContext(sparkConf) try { //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])] val rdd = sc.parallelize(Array( (Bytes.toBytes("1"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))), (Bytes.toBytes("2"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))), (Bytes.toBytes("3"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))), (Bytes.toBytes("4"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))), (Bytes.toBytes("5"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5")))) )) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) rdd.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName), (putRecord) => { val put = new Put(putRecord._1) putRecord._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3)) put }) } finally { sc.stop() } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseForeachPartitionExample.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.hadoop.hbase.spark.example.rdd import org.apache.hadoop.hbase.client.Put import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.{SparkContext, SparkConf} /** * This is a simple example of using the foreachPartition * method with a HBase connection */ object HBaseForeachPartitionExample { def main(args: Array[String]) { if (args.length < 2) { println("HBaseBulkPutExample {tableName} {columnFamily}") return } val tableName = args(0) val columnFamily = args(1) val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " + tableName + " " + columnFamily) val sc = new SparkContext(sparkConf) try { //[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])] val rdd = sc.parallelize(Array( (Bytes.toBytes("1"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))), (Bytes.toBytes("2"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))), (Bytes.toBytes("3"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))), (Bytes.toBytes("4"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))), (Bytes.toBytes("5"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5")))) )) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) rdd.hbaseForeachPartition(hbaseContext, (it, connection) => { val m = connection.getBufferedMutator(TableName.valueOf(tableName)) it.foreach(r => { val put = new Put(r._1) r._2.foreach((putValue) => put.addColumn(putValue._1, putValue._2, putValue._3)) m.mutate(put) }) m.flush() m.close() }) } finally { sc.stop() } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala/org/apache/hadoop/hbase/spark/example/rdd/HBaseMapPartitionExample.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.hadoop.hbase.spark.example.rdd import org.apache.hadoop.hbase.client.Get import org.apache.hadoop.hbase.{TableName, HBaseConfiguration} import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._ import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.{SparkContext, SparkConf} /** * This is a simple example of using the mapPartitions * method with a HBase connection */ object HBaseMapPartitionExample { def main(args: Array[String]) { if (args.length < 1) { println("HBaseBulkGetExample {tableName}") return } val tableName = args(0) val sparkConf = new SparkConf().setAppName("HBaseBulkGetExample " + tableName) val sc = new SparkContext(sparkConf) try { //[(Array[Byte])] val rdd = sc.parallelize(Array( Bytes.toBytes("1"), Bytes.toBytes("2"), Bytes.toBytes("3"), Bytes.toBytes("4"), Bytes.toBytes("5"), Bytes.toBytes("6"), Bytes.toBytes("7"))) val conf = HBaseConfiguration.create() val hbaseContext = new HBaseContext(sc, conf) val getRdd = rdd.hbaseMapPartitions[String](hbaseContext, (it, connection) => { val table = connection.getTable(TableName.valueOf(tableName)) it.map{r => //batching would be faster. This is just an example val result = table.get(new Get(r)) val it = result.listCells().iterator() val b = new StringBuilder b.append(Bytes.toString(result.getRow) + ":") while (it.hasNext) { val cell = it.next() val q = Bytes.toString(cell.getQualifierArray) if (q.equals("counter")) { b.append("(" + q + "," + Bytes.toLong(cell.getValueArray) + ")") } else { b.append("(" + q + "," + Bytes.toString(cell.getValueArray) + ")") } } b.toString() } }) getRdd.collect().foreach(v => println(v)) } finally { sc.stop() } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala-spark-2.3/apache/hadoop/hbase/spark/datasources/HBaseTableScanRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.hadoop.hbase.spark.datasources import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.spark._ import org.apache.hadoop.hbase.spark.datasources.HBaseResources._ import org.apache.hadoop.hbase.spark.hbase._ import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.{Partition, SparkEnv, TaskContext} import java.util.ArrayList import scala.collection.mutable class HBaseTableScanRDD(relation: HBaseRelation, val hbaseContext: HBaseContext, @transient val filter: Option[SparkSQLPushDownFilter] = None, val columns: Seq[SchemaQualifierDefinition] = Seq.empty )extends RDD[Result](relation.sqlContext.sparkContext, Nil) with Logging { private def sparkConf = SparkEnv.get.conf @transient var ranges = Seq.empty[Range] @transient var points = Seq.empty[Array[Byte]] def addPoint(p: Array[Byte]) { points :+= p } def addRange(r: ScanRange) = { val lower = if (r.lowerBound != null && r.lowerBound.length > 0) { Some(Bound(r.lowerBound, r.isLowerBoundEqualTo)) } else { None } val upper = if (r.upperBound != null && r.upperBound.length > 0) { if (!r.isUpperBoundEqualTo) { Some(Bound(r.upperBound, false)) } else { // HBase stopRow is exclusive: therefore it DOESN'T act like isUpperBoundEqualTo // by default. So we need to add a new max byte to the stopRow key val newArray = new Array[Byte](r.upperBound.length + 1) System.arraycopy(r.upperBound, 0, newArray, 0, r.upperBound.length) //New Max Bytes newArray(r.upperBound.length) = ByteMin Some(Bound(newArray, false)) } } else { None } ranges :+= Range(lower, upper) } override def getPartitions: Array[Partition] = { val regions = RegionResource(relation) var idx = 0 logDebug(s"There are ${regions.size} regions") val ps = regions.flatMap { x => val rs = Ranges.and(Range(x), ranges) val ps = Points.and(Range(x), points) if (rs.size > 0 || ps.size > 0) { if(log.isDebugEnabled) { rs.foreach(x => logDebug(x.toString)) } idx += 1 Some(HBaseScanPartition(idx - 1, x, rs, ps, SerializedFilter.toSerializedTypedFilter(filter))) } else { None } }.toArray regions.release() ps.asInstanceOf[Array[Partition]] } override def getPreferredLocations(split: Partition): Seq[String] = { split.asInstanceOf[HBaseScanPartition].regions.server.map { identity }.toSeq } private def buildGets( tbr: TableResource, g: Seq[Array[Byte]], filter: Option[SparkSQLPushDownFilter], columns: Seq[SchemaQualifierDefinition], hbaseContext: HBaseContext): Iterator[Result] = { g.grouped(relation.bulkGetSize).flatMap{ x => val gets = new ArrayList[Get]() x.foreach{ y => val g = new Get(y) columns.foreach { d => if (d.columnFamilyBytes.length > 0) { g.addColumn(d.columnFamilyBytes, d.qualifierBytes) } } filter.foreach(g.setFilter(_)) gets.add(g) } val tmp = tbr.get(gets) rddResources.addResource(tmp) toResultIterator(tmp) } } private def toResultIterator(result: GetResource): Iterator[Result] = { val iterator = new Iterator[Result] { var idx = 0 var cur: Option[Result] = None override def hasNext: Boolean = { while(idx < result.length && cur.isEmpty) { val r = result(idx) idx += 1 if (!r.isEmpty) { cur = Some(r) } } if (cur.isEmpty) { rddResources.release(result) } cur.isDefined } override def next(): Result = { hasNext val ret = cur.get cur = None ret } } iterator } private def buildScan(range: Range, filter: Option[SparkSQLPushDownFilter], columns: Seq[SchemaQualifierDefinition]): Scan = { val scan = (range.lower, range.upper) match { case (Some(Bound(a, b)), Some(Bound(c, d))) => new Scan(a, c) case (None, Some(Bound(c, d))) => new 
Scan(Array[Byte](), c) case (Some(Bound(a, b)), None) => new Scan(a) case (None, None) => new Scan() } columns.foreach { d => if (d.columnFamilyBytes.length > 0) { scan.addColumn(d.columnFamilyBytes, d.qualifierBytes) } } scan.setCacheBlocks(relation.blockCacheEnable) scan.setBatch(relation.batchNum) scan.setCaching(relation.cacheSize) filter.foreach(scan.setFilter(_)) scan } private def toResultIterator(scanner: ScanResource): Iterator[Result] = { val iterator = new Iterator[Result] { var cur: Option[Result] = None override def hasNext: Boolean = { if (cur.isEmpty) { val r = scanner.next() if (r == null) { rddResources.release(scanner) } else { cur = Some(r) } } cur.isDefined } override def next(): Result = { hasNext val ret = cur.get cur = None ret } } iterator } lazy val rddResources = RDDResources(new mutable.HashSet[Resource]()) private def close() { rddResources.release() } override def compute(split: Partition, context: TaskContext): Iterator[Result] = { val partition = split.asInstanceOf[HBaseScanPartition] val filter = SerializedFilter.fromSerializedFilter(partition.sf) val scans = partition.scanRanges .map(buildScan(_, filter, columns)) val tableResource = TableResource(relation) context.addTaskCompletionListener(context => close()) val points = partition.points val gIt: Iterator[Result] = { if (points.isEmpty) { Iterator.empty: Iterator[Result] } else { buildGets(tableResource, points, filter, columns, hbaseContext) } } val rIts = scans.par .map { scan => val scanner = tableResource.getScanner(scan) rddResources.addResource(scanner) scanner }.map(toResultIterator(_)) .fold(Iterator.empty: Iterator[Result]){ case (x, y) => x ++ y } ++ gIt rIts } } case class SerializedFilter(b: Option[Array[Byte]]) object SerializedFilter { def toSerializedTypedFilter(f: Option[SparkSQLPushDownFilter]): SerializedFilter = { SerializedFilter(f.map(_.toByteArray)) } def fromSerializedFilter(sf: SerializedFilter): Option[SparkSQLPushDownFilter] = { sf.b.map(SparkSQLPushDownFilter.parseFrom(_)) } } private[hbase] case class HBaseRegion( override val index: Int, val start: Option[HBaseType] = None, val end: Option[HBaseType] = None, val server: Option[String] = None) extends Partition private[hbase] case class HBaseScanPartition( override val index: Int, val regions: HBaseRegion, val scanRanges: Seq[Range], val points: Seq[Array[Byte]], val sf: SerializedFilter) extends Partition case class RDDResources(set: mutable.HashSet[Resource]) { def addResource(s: Resource) { set += s } def release() { set.foreach(release(_)) } def release(rs: Resource) { try { rs.release() } finally { set.remove(rs) } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala-spark-2.4/apache/hadoop/hbase/spark/datasources/HBaseTableScanRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
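// The addRange logic above turns an inclusive upper bound into HBase's exclusive stopRow by
// appending the smallest possible byte (ByteMin, 0x00): every row equal to the original bound
// still sorts below the new stopRow, so the scan keeps it. A standalone sketch with a toy key:
import org.apache.hadoop.hbase.util.Bytes

val upperInclusive = Bytes.toBytes("row5")
val stopRow = new Array[Byte](upperInclusive.length + 1)
System.arraycopy(upperInclusive, 0, stopRow, 0, upperInclusive.length)
stopRow(upperInclusive.length) = 0x00.toByte   // ByteMin from the hbase package object
// A scan ending at stopRow (exclusive) now returns "row5" itself, while "row50", "row6", ... stay excluded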
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.datasources import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.spark._ import org.apache.hadoop.hbase.spark.datasources.HBaseResources._ import org.apache.hadoop.hbase.spark.hbase._ import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.{Partition, SparkEnv, TaskContext} import java.util.ArrayList import scala.collection.mutable class HBaseTableScanRDD(relation: HBaseRelation, val hbaseContext: HBaseContext, @transient val filter: Option[SparkSQLPushDownFilter] = None, val columns: Seq[SchemaQualifierDefinition] = Seq.empty )extends RDD[Result](relation.sqlContext.sparkContext, Nil) with Logging { private def sparkConf = SparkEnv.get.conf @transient var ranges = Seq.empty[Range] @transient var points = Seq.empty[Array[Byte]] def addPoint(p: Array[Byte]) { points :+= p } def addRange(r: ScanRange) = { val lower = if (r.lowerBound != null && r.lowerBound.length > 0) { Some(Bound(r.lowerBound, r.isLowerBoundEqualTo)) } else { None } val upper = if (r.upperBound != null && r.upperBound.length > 0) { if (!r.isUpperBoundEqualTo) { Some(Bound(r.upperBound, false)) } else { // HBase stopRow is exclusive: therefore it DOESN'T act like isUpperBoundEqualTo // by default. So we need to add a new max byte to the stopRow key val newArray = new Array[Byte](r.upperBound.length + 1) System.arraycopy(r.upperBound, 0, newArray, 0, r.upperBound.length) //New Max Bytes newArray(r.upperBound.length) = ByteMin Some(Bound(newArray, false)) } } else { None } ranges :+= Range(lower, upper) } override def getPartitions: Array[Partition] = { val regions = RegionResource(relation) var idx = 0 logDebug(s"There are ${regions.size} regions") val ps = regions.flatMap { x => val rs = Ranges.and(Range(x), ranges) val ps = Points.and(Range(x), points) if (rs.size > 0 || ps.size > 0) { if(log.isDebugEnabled) { rs.foreach(x => logDebug(x.toString)) } idx += 1 Some(HBaseScanPartition(idx - 1, x, rs, ps, SerializedFilter.toSerializedTypedFilter(filter))) } else { None } }.toArray regions.release() ps.asInstanceOf[Array[Partition]] } override def getPreferredLocations(split: Partition): Seq[String] = { split.asInstanceOf[HBaseScanPartition].regions.server.map { identity }.toSeq } private def buildGets( tbr: TableResource, g: Seq[Array[Byte]], filter: Option[SparkSQLPushDownFilter], columns: Seq[SchemaQualifierDefinition], hbaseContext: HBaseContext): Iterator[Result] = { g.grouped(relation.bulkGetSize).flatMap{ x => val gets = new ArrayList[Get]() x.foreach{ y => val g = new Get(y) columns.foreach { d => if (d.columnFamilyBytes.length > 0) { g.addColumn(d.columnFamilyBytes, d.qualifierBytes) } } filter.foreach(g.setFilter(_)) gets.add(g) } val tmp = tbr.get(gets) rddResources.addResource(tmp) toResultIterator(tmp) } } private def toResultIterator(result: GetResource): Iterator[Result] = { val iterator = new Iterator[Result] { var idx = 0 var cur: Option[Result] = None override def hasNext: Boolean = { while(idx < result.length && cur.isEmpty) { val r = result(idx) idx += 1 if (!r.isEmpty) { cur = Some(r) } } if (cur.isEmpty) { rddResources.release(result) } cur.isDefined } override def next(): Result = { hasNext val ret = cur.get cur = None ret } } iterator } private def buildScan(range: Range, filter: Option[SparkSQLPushDownFilter], columns: Seq[SchemaQualifierDefinition]): Scan = { val scan = (range.lower, range.upper) match 
{ case (Some(Bound(a, b)), Some(Bound(c, d))) => new Scan(a, c) case (None, Some(Bound(c, d))) => new Scan(Array[Byte](), c) case (Some(Bound(a, b)), None) => new Scan(a) case (None, None) => new Scan() } columns.foreach { d => if (d.columnFamilyBytes.length > 0) { scan.addColumn(d.columnFamilyBytes, d.qualifierBytes) } } scan.setCacheBlocks(relation.blockCacheEnable) scan.setBatch(relation.batchNum) scan.setCaching(relation.cacheSize) filter.foreach(scan.setFilter(_)) scan } private def toResultIterator(scanner: ScanResource): Iterator[Result] = { val iterator = new Iterator[Result] { var cur: Option[Result] = None override def hasNext: Boolean = { if (cur.isEmpty) { val r = scanner.next() if (r == null) { rddResources.release(scanner) } else { cur = Some(r) } } cur.isDefined } override def next(): Result = { hasNext val ret = cur.get cur = None ret } } iterator } lazy val rddResources = RDDResources(new mutable.HashSet[Resource]()) private def close() { rddResources.release() } override def compute(split: Partition, context: TaskContext): Iterator[Result] = { val partition = split.asInstanceOf[HBaseScanPartition] val filter = SerializedFilter.fromSerializedFilter(partition.sf) val scans = partition.scanRanges .map(buildScan(_, filter, columns)) val tableResource = TableResource(relation) context.addTaskCompletionListener(context => close()) val points = partition.points val gIt: Iterator[Result] = { if (points.isEmpty) { Iterator.empty: Iterator[Result] } else { buildGets(tableResource, points, filter, columns, hbaseContext) } } val rIts = scans.par .map { scan => val scanner = tableResource.getScanner(scan) rddResources.addResource(scanner) scanner }.map(toResultIterator(_)) .fold(Iterator.empty: Iterator[Result]){ case (x, y) => x ++ y } ++ gIt rIts } } case class SerializedFilter(b: Option[Array[Byte]]) object SerializedFilter { def toSerializedTypedFilter(f: Option[SparkSQLPushDownFilter]): SerializedFilter = { SerializedFilter(f.map(_.toByteArray)) } def fromSerializedFilter(sf: SerializedFilter): Option[SparkSQLPushDownFilter] = { sf.b.map(SparkSQLPushDownFilter.parseFrom(_)) } } private[hbase] case class HBaseRegion( override val index: Int, val start: Option[HBaseType] = None, val end: Option[HBaseType] = None, val server: Option[String] = None) extends Partition private[hbase] case class HBaseScanPartition( override val index: Int, val regions: HBaseRegion, val scanRanges: Seq[Range], val points: Seq[Array[Byte]], val sf: SerializedFilter) extends Partition case class RDDResources(set: mutable.HashSet[Resource]) { def addResource(s: Resource) { set += s } def release() { set.foreach(release(_)) } def release(rs: Resource) { try { rs.release() } finally { set.remove(rs) } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala-spark-3.0/org/apache/hadoop/hbase/spark/datasources/HBaseTableScanRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
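// buildGets above batches point lookups: row keys are grouped into chunks of relation.bulkGetSize
// and each chunk becomes a single multi-get call. A rough sketch of the same batching with
// hypothetical keys (no table access here):
import java.util.ArrayList
import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.util.Bytes

val rowKeys: Seq[Array[Byte]] = (1 to 2500).map(i => Bytes.toBytes(s"row$i"))
val bulkGetSize = 1000                              // default from HBaseSparkConf.BULKGET_SIZE
rowKeys.grouped(bulkGetSize).foreach { chunk =>
  val gets = new ArrayList[Get]()
  chunk.foreach(k => gets.add(new Get(k)))
  // table.get(gets) would issue one batched request per chunk (3 chunks for 2500 keys)
}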
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.datasources import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.spark._ import org.apache.hadoop.hbase.spark.datasources.HBaseResources._ import org.apache.hadoop.hbase.spark.hbase._ import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.{Partition, SparkEnv, TaskContext} import java.util.ArrayList import scala.collection.mutable class HBaseTableScanRDD(relation: HBaseRelation, val hbaseContext: HBaseContext, @transient val filter: Option[SparkSQLPushDownFilter] = None, val columns: Seq[SchemaQualifierDefinition] = Seq.empty )extends RDD[Result](relation.sqlContext.sparkContext, Nil) with Logging { private def sparkConf = SparkEnv.get.conf @transient var ranges = Seq.empty[Range] @transient var points = Seq.empty[Array[Byte]] def addPoint(p: Array[Byte]) { points :+= p } def addRange(r: ScanRange) = { val lower = if (r.lowerBound != null && r.lowerBound.length > 0) { Some(Bound(r.lowerBound, r.isLowerBoundEqualTo)) } else { None } val upper = if (r.upperBound != null && r.upperBound.length > 0) { if (!r.isUpperBoundEqualTo) { Some(Bound(r.upperBound, false)) } else { // HBase stopRow is exclusive: therefore it DOESN'T act like isUpperBoundEqualTo // by default. So we need to add a new max byte to the stopRow key val newArray = new Array[Byte](r.upperBound.length + 1) System.arraycopy(r.upperBound, 0, newArray, 0, r.upperBound.length) //New Max Bytes newArray(r.upperBound.length) = ByteMin Some(Bound(newArray, false)) } } else { None } ranges :+= Range(lower, upper) } override def getPartitions: Array[Partition] = { val regions = RegionResource(relation) var idx = 0 logDebug(s"There are ${regions.size} regions") val ps = regions.flatMap { x => val rs = Ranges.and(Range(x), ranges) val ps = Points.and(Range(x), points) if (rs.size > 0 || ps.size > 0) { if(log.isDebugEnabled) { rs.foreach(x => logDebug(x.toString)) } idx += 1 Some(HBaseScanPartition(idx - 1, x, rs, ps, SerializedFilter.toSerializedTypedFilter(filter))) } else { None } }.toArray regions.release() ps.asInstanceOf[Array[Partition]] } override def getPreferredLocations(split: Partition): Seq[String] = { split.asInstanceOf[HBaseScanPartition].regions.server.map { identity }.toSeq } private def buildGets( tbr: TableResource, g: Seq[Array[Byte]], filter: Option[SparkSQLPushDownFilter], columns: Seq[SchemaQualifierDefinition], hbaseContext: HBaseContext): Iterator[Result] = { g.grouped(relation.bulkGetSize).flatMap{ x => val gets = new ArrayList[Get]() x.foreach{ y => val g = new Get(y) columns.foreach { d => if (d.columnFamilyBytes.length > 0) { g.addColumn(d.columnFamilyBytes, d.qualifierBytes) } } filter.foreach(g.setFilter(_)) gets.add(g) } val tmp = tbr.get(gets) rddResources.addResource(tmp) toResultIterator(tmp) } } private def toResultIterator(result: GetResource): Iterator[Result] = { val iterator = new Iterator[Result] { var idx = 0 var cur: Option[Result] = None override def hasNext: Boolean = { while(idx < result.length && cur.isEmpty) { val r = result(idx) idx += 1 if (!r.isEmpty) { cur = Some(r) } } if (cur.isEmpty) { 
rddResources.release(result) } cur.isDefined } override def next(): Result = { hasNext val ret = cur.get cur = None ret } } iterator } private def buildScan(range: Range, filter: Option[SparkSQLPushDownFilter], columns: Seq[SchemaQualifierDefinition]): Scan = { val scan = (range.lower, range.upper) match { case (Some(Bound(a, b)), Some(Bound(c, d))) => new Scan(a, c) case (None, Some(Bound(c, d))) => new Scan(Array[Byte](), c) case (Some(Bound(a, b)), None) => new Scan(a) case (None, None) => new Scan() } columns.foreach { d => if (d.columnFamilyBytes.length > 0) { scan.addColumn(d.columnFamilyBytes, d.qualifierBytes) } } scan.setCacheBlocks(relation.blockCacheEnable) scan.setBatch(relation.batchNum) scan.setCaching(relation.cacheSize) filter.foreach(scan.setFilter(_)) scan } private def toResultIterator(scanner: ScanResource): Iterator[Result] = { val iterator = new Iterator[Result] { var cur: Option[Result] = None override def hasNext: Boolean = { if (cur.isEmpty) { val r = scanner.next() if (r == null) { rddResources.release(scanner) } else { cur = Some(r) } } cur.isDefined } override def next(): Result = { hasNext val ret = cur.get cur = None ret } } iterator } lazy val rddResources = RDDResources(new mutable.HashSet[Resource]()) private def close() { rddResources.release() } override def compute(split: Partition, context: TaskContext): Iterator[Result] = { val partition = split.asInstanceOf[HBaseScanPartition] val filter = SerializedFilter.fromSerializedFilter(partition.sf) val scans = partition.scanRanges .map(buildScan(_, filter, columns)) val tableResource = TableResource(relation) context.addTaskCompletionListener[Unit](context => close()) val points = partition.points val gIt: Iterator[Result] = { if (points.isEmpty) { Iterator.empty: Iterator[Result] } else { buildGets(tableResource, points, filter, columns, hbaseContext) } } val rIts = scans.par .map { scan => val scanner = tableResource.getScanner(scan) rddResources.addResource(scanner) scanner }.map(toResultIterator(_)) .fold(Iterator.empty: Iterator[Result]){ case (x, y) => x ++ y } ++ gIt rIts } } case class SerializedFilter(b: Option[Array[Byte]]) object SerializedFilter { def toSerializedTypedFilter(f: Option[SparkSQLPushDownFilter]): SerializedFilter = { SerializedFilter(f.map(_.toByteArray)) } def fromSerializedFilter(sf: SerializedFilter): Option[SparkSQLPushDownFilter] = { sf.b.map(SparkSQLPushDownFilter.parseFrom(_)) } } private[hbase] case class HBaseRegion( override val index: Int, val start: Option[HBaseType] = None, val end: Option[HBaseType] = None, val server: Option[String] = None) extends Partition private[hbase] case class HBaseScanPartition( override val index: Int, val regions: HBaseRegion, val scanRanges: Seq[Range], val points: Seq[Array[Byte]], val sf: SerializedFilter) extends Partition case class RDDResources(set: mutable.HashSet[Resource]) { def addResource(s: Resource) { set += s } def release() { set.foreach(release(_)) } def release(rs: Resource) { try { rs.release() } finally { set.remove(rs) } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala-spark-3.0/org/apache/spark/deploy/SparkHadoopUtil.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
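// compute() above registers close() as a task-completion listener so every scanner and multi-get
// result tracked in RDDResources is released even when a task fails mid-iteration (this Spark 3.0
// build uses the typed addTaskCompletionListener[Unit] overload). A toy sketch of the same
// cleanup pattern, assuming it runs inside a task on Spark 2.4+/3.x:
import org.apache.spark.TaskContext
import scala.collection.mutable

trait ToyResource { def release(): Unit }
val tracked = mutable.HashSet.empty[ToyResource]

Option(TaskContext.get()).foreach { ctx =>
  ctx.addTaskCompletionListener[Unit](_ => tracked.foreach(_.release()))
}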
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.deploy import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream, File, IOException} import java.security.PrivilegedExceptionAction import java.text.DateFormat import java.util.{Arrays, Date, Locale} import scala.collection.JavaConverters._ import scala.collection.immutable.Map import scala.collection.mutable import scala.collection.mutable.HashMap import scala.util.control.NonFatal import com.google.common.primitives.Longs import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.hadoop.security.token.{Token, TokenIdentifier} import org.apache.hadoop.security.token.delegation.AbstractDelegationTokenIdentifier import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.internal.config.BUFFER_SIZE import org.apache.spark.util.Utils /** * Contains util methods to interact with Hadoop from Spark. */ class SparkHadoopUtil extends Logging { private val sparkConf = new SparkConf(false).loadFromSystemProperties(true) val conf: Configuration = newConfiguration(sparkConf) UserGroupInformation.setConfiguration(conf) /** * Runs the given function with a Hadoop UserGroupInformation as a thread local variable * (distributed to child threads), used for authenticating HDFS and YARN calls. * * IMPORTANT NOTE: If this function is going to be called repeated in the same process * you need to look https://issues.apache.org/jira/browse/HDFS-3545 and possibly * do a FileSystem.closeAllForUGI in order to avoid leaking Filesystems */ def runAsSparkUser(func: () => Unit): Unit = { createSparkUser().doAs(new PrivilegedExceptionAction[Unit] { def run: Unit = func() }) } def createSparkUser(): UserGroupInformation = { val user = Utils.getCurrentUserName() logDebug("creating UGI for user: " + user) val ugi = UserGroupInformation.createRemoteUser(user) transferCredentials(UserGroupInformation.getCurrentUser(), ugi) ugi } def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation): Unit = { dest.addCredentials(source.getCredentials()) } /** * Appends S3-specific, spark.hadoop.*, and spark.buffer.size configurations to a Hadoop * configuration. */ def appendS3AndSparkHadoopHiveConfigurations( conf: SparkConf, hadoopConf: Configuration): Unit = { SparkHadoopUtil.appendS3AndSparkHadoopHiveConfigurations(conf, hadoopConf) } /** * Appends spark.hadoop.* configurations from a [[SparkConf]] to a Hadoop * configuration without the spark.hadoop. prefix. */ def appendSparkHadoopConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { SparkHadoopUtil.appendSparkHadoopConfigs(conf, hadoopConf) } /** * Appends spark.hadoop.* configurations from a Map to another without the spark.hadoop. prefix. 
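// runAsSparkUser above wraps the standard Hadoop UGI doAs pattern: build a UserGroupInformation
// for the current Spark user, copy over its credentials, and run the closure inside doAs so that
// HDFS/YARN calls authenticate as that user. A rough standalone sketch of the underlying pattern
// (the user name and path are hypothetical):
import java.security.PrivilegedExceptionAction
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.security.UserGroupInformation

val ugi = UserGroupInformation.createRemoteUser("spark_user")
ugi.addCredentials(UserGroupInformation.getCurrentUser.getCredentials)
ugi.doAs(new PrivilegedExceptionAction[Unit] {
  override def run(): Unit = {
    val fs = FileSystem.get(new Configuration())
    println(fs.exists(new Path("/tmp")))   // executes with spark_user's identity
  }
})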
*/ def appendSparkHadoopConfigs( srcMap: Map[String, String], destMap: HashMap[String, String]): Unit = { // Copy any "spark.hadoop.foo=bar" system properties into destMap as "foo=bar" for ((key, value) <- srcMap if key.startsWith("spark.hadoop.")) { destMap.put(key.substring("spark.hadoop.".length), value) } } def appendSparkHiveConfigs( srcMap: Map[String, String], destMap: HashMap[String, String]): Unit = { // Copy any "spark.hive.foo=bar" system properties into destMap as "hive.foo=bar" for ((key, value) <- srcMap if key.startsWith("spark.hive.")) { destMap.put(key.substring("spark.".length), value) } } /** * Return an appropriate (subclass) of Configuration. Creating config can initialize some Hadoop * subsystems. */ def newConfiguration(conf: SparkConf): Configuration = { val hadoopConf = SparkHadoopUtil.newConfiguration(conf) hadoopConf.addResource(SparkHadoopUtil.SPARK_HADOOP_CONF_FILE) hadoopConf } /** * Add any user credentials to the job conf which are necessary for running on a secure Hadoop * cluster. */ def addCredentials(conf: JobConf): Unit = { val jobCreds = conf.getCredentials() jobCreds.mergeAll(UserGroupInformation.getCurrentUser().getCredentials()) } def addCurrentUserCredentials(creds: Credentials): Unit = { UserGroupInformation.getCurrentUser.addCredentials(creds) } def loginUserFromKeytab(principalName: String, keytabFilename: String): Unit = { if (!new File(keytabFilename).exists()) { throw new SparkException(s"Keytab file: ${keytabFilename} does not exist") } else { logInfo("Attempting to login to Kerberos " + s"using principal: ${principalName} and keytab: ${keytabFilename}") UserGroupInformation.loginUserFromKeytab(principalName, keytabFilename) } } /** * Add or overwrite current user's credentials with serialized delegation tokens, * also confirms correct hadoop configuration is set. */ private[spark] def addDelegationTokens(tokens: Array[Byte], sparkConf: SparkConf): Unit = { UserGroupInformation.setConfiguration(newConfiguration(sparkConf)) val creds = deserialize(tokens) logInfo("Updating delegation tokens for current user.") logDebug(s"Adding/updating delegation tokens ${dumpTokens(creds)}") addCurrentUserCredentials(creds) } /** * Returns a function that can be called to find Hadoop FileSystem bytes read. If * getFSBytesReadOnThreadCallback is called from thread r at time t, the returned callback will * return the bytes read on r since t. */ private[spark] def getFSBytesReadOnThreadCallback(): () => Long = { val f = () => FileSystem.getAllStatistics.asScala.map(_.getThreadStatistics.getBytesRead).sum val baseline = (Thread.currentThread().getId, f()) /** * This function may be called in both spawned child threads and parent task thread (in * PythonRDD), and Hadoop FileSystem uses thread local variables to track the statistics. * So we need a map to track the bytes read from the child threads and parent thread, * summing them together to get the bytes read of this task. */ new Function0[Long] { private val bytesReadMap = new mutable.HashMap[Long, Long]() override def apply(): Long = { bytesReadMap.synchronized { bytesReadMap.put(Thread.currentThread().getId, f()) bytesReadMap.map { case (k, v) => v - (if (k == baseline._1) baseline._2 else 0) }.sum } } } } /** * Returns a function that can be called to find Hadoop FileSystem bytes written. If * getFSBytesWrittenOnThreadCallback is called from thread r at time t, the returned callback will * return the bytes written on r since t. * * @return None if the required method can't be found. 
*/ private[spark] def getFSBytesWrittenOnThreadCallback(): () => Long = { val threadStats = FileSystem.getAllStatistics.asScala.map(_.getThreadStatistics) val f = () => threadStats.map(_.getBytesWritten).sum val baselineBytesWritten = f() () => f() - baselineBytesWritten } /** * Get [[FileStatus]] objects for all leaf children (files) under the given base path. If the * given path points to a file, return a single-element collection containing [[FileStatus]] of * that file. */ def listLeafStatuses(fs: FileSystem, basePath: Path): Seq[FileStatus] = { listLeafStatuses(fs, fs.getFileStatus(basePath)) } /** * Get [[FileStatus]] objects for all leaf children (files) under the given base path. If the * given path points to a file, return a single-element collection containing [[FileStatus]] of * that file. */ def listLeafStatuses(fs: FileSystem, baseStatus: FileStatus): Seq[FileStatus] = { def recurse(status: FileStatus): Seq[FileStatus] = { val (directories, leaves) = fs.listStatus(status.getPath).partition(_.isDirectory) leaves ++ directories.flatMap(f => listLeafStatuses(fs, f)) } if (baseStatus.isDirectory) recurse(baseStatus) else Seq(baseStatus) } def listLeafDirStatuses(fs: FileSystem, basePath: Path): Seq[FileStatus] = { listLeafDirStatuses(fs, fs.getFileStatus(basePath)) } def listLeafDirStatuses(fs: FileSystem, baseStatus: FileStatus): Seq[FileStatus] = { def recurse(status: FileStatus): Seq[FileStatus] = { val (directories, files) = fs.listStatus(status.getPath).partition(_.isDirectory) val leaves = if (directories.isEmpty) Seq(status) else Seq.empty[FileStatus] leaves ++ directories.flatMap(dir => listLeafDirStatuses(fs, dir)) } assert(baseStatus.isDirectory) recurse(baseStatus) } def isGlobPath(pattern: Path): Boolean = { pattern.toString.exists("{}[]*?\\".toSet.contains) } def globPath(pattern: Path): Seq[Path] = { val fs = pattern.getFileSystem(conf) globPath(fs, pattern) } def globPath(fs: FileSystem, pattern: Path): Seq[Path] = { Option(fs.globStatus(pattern)).map { statuses => statuses.map(_.getPath.makeQualified(fs.getUri, fs.getWorkingDirectory)).toSeq }.getOrElse(Seq.empty[Path]) } def globPathIfNecessary(pattern: Path): Seq[Path] = { if (isGlobPath(pattern)) globPath(pattern) else Seq(pattern) } def globPathIfNecessary(fs: FileSystem, pattern: Path): Seq[Path] = { if (isGlobPath(pattern)) globPath(fs, pattern) else Seq(pattern) } /** * Lists all the files in a directory with the specified prefix, and does not end with the * given suffix. The returned {{FileStatus}} instances are sorted by the modification times of * the respective files. 
*/ def listFilesSorted( remoteFs: FileSystem, dir: Path, prefix: String, exclusionSuffix: String): Array[FileStatus] = { try { val fileStatuses = remoteFs.listStatus(dir, new PathFilter { override def accept(path: Path): Boolean = { val name = path.getName name.startsWith(prefix) && !name.endsWith(exclusionSuffix) } }) Arrays.sort(fileStatuses, (o1: FileStatus, o2: FileStatus) => Longs.compare(o1.getModificationTime, o2.getModificationTime)) fileStatuses } catch { case NonFatal(e) => logWarning("Error while attempting to list files from application staging dir", e) Array.empty } } private[spark] def getSuffixForCredentialsPath(credentialsPath: Path): Int = { val fileName = credentialsPath.getName fileName.substring( fileName.lastIndexOf(SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM) + 1).toInt } private val HADOOP_CONF_PATTERN = "(\\$\\{hadoopconf-[^\\}\\$\\s]+\\})".r.unanchored /** * Substitute variables by looking them up in Hadoop configs. Only variables that match the * ${hadoopconf- .. } pattern are substituted. */ def substituteHadoopVariables(text: String, hadoopConf: Configuration): String = { text match { case HADOOP_CONF_PATTERN(matched) => logDebug(text + " matched " + HADOOP_CONF_PATTERN) val key = matched.substring(13, matched.length() - 1) // remove ${hadoopconf- .. } val eval = Option[String](hadoopConf.get(key)) .map { value => logDebug("Substituted " + matched + " with " + value) text.replace(matched, value) } if (eval.isEmpty) { // The variable was not found in Hadoop configs, so return text as is. text } else { // Continue to substitute more variables. substituteHadoopVariables(eval.get, hadoopConf) } case _ => logDebug(text + " didn't match " + HADOOP_CONF_PATTERN) text } } /** * Dump the credentials' tokens to string values. * * @param credentials credentials * @return an iterator over the string values. If no credentials are passed in: an empty list */ private[spark] def dumpTokens(credentials: Credentials): Iterable[String] = { if (credentials != null) { credentials.getAllTokens.asScala.map(tokenToString) } else { Seq.empty } } /** * Convert a token to a string for logging. * If its an abstract delegation token, attempt to unmarshall it and then * print more details, including timestamps in human-readable form. * * @param token token to convert to a string * @return a printable string value. 
*/ private[spark] def tokenToString(token: Token[_ <: TokenIdentifier]): String = { val df = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.SHORT, Locale.US) val buffer = new StringBuilder(128) buffer.append(token.toString) try { val ti = token.decodeIdentifier buffer.append("; ").append(ti) ti match { case dt: AbstractDelegationTokenIdentifier => // include human times and the renewer, which the HDFS tokens toString omits buffer.append("; Renewer: ").append(dt.getRenewer) buffer.append("; Issued: ").append(df.format(new Date(dt.getIssueDate))) buffer.append("; Max Date: ").append(df.format(new Date(dt.getMaxDate))) case _ => } } catch { case e: IOException => logDebug(s"Failed to decode $token: $e", e) } buffer.toString } def serialize(creds: Credentials): Array[Byte] = { val byteStream = new ByteArrayOutputStream val dataStream = new DataOutputStream(byteStream) creds.writeTokenStorageToStream(dataStream) byteStream.toByteArray } def deserialize(tokenBytes: Array[Byte]): Credentials = { val tokensBuf = new ByteArrayInputStream(tokenBytes) val creds = new Credentials() creds.readTokenStorageStream(new DataInputStream(tokensBuf)) creds } def isProxyUser(ugi: UserGroupInformation): Boolean = { ugi.getAuthenticationMethod() == UserGroupInformation.AuthenticationMethod.PROXY } } object SparkHadoopUtil extends Logging { private lazy val instance = new SparkHadoopUtil val SPARK_YARN_CREDS_TEMP_EXTENSION = ".tmp" val SPARK_YARN_CREDS_COUNTER_DELIM = "-" /** * Number of records to update input metrics when reading from HadoopRDDs. * * Each update is potentially expensive because we need to use reflection to access the * Hadoop FileSystem API of interest (only available in 2.5), so we should do this sparingly. */ private[spark] val UPDATE_INPUT_METRICS_INTERVAL_RECORDS = 1000 /** * Name of the file containing the gateway's Hadoop configuration, to be overlayed on top of the * cluster's Hadoop config. It is up to the Spark code launching the application to create * this file if it's desired. If the file doesn't exist, it will just be ignored. */ private[spark] val SPARK_HADOOP_CONF_FILE = "__spark_hadoop_conf__.xml" def get: SparkHadoopUtil = instance /** * Returns a Configuration object with Spark configuration applied on top. Unlike * the instance method, this will always return a Configuration instance, and not a * cluster manager-specific type. */ private[spark] def newConfiguration(conf: SparkConf): Configuration = { val hadoopConf = new Configuration() appendS3AndSparkHadoopHiveConfigurations(conf, hadoopConf) hadoopConf } private def appendS3AndSparkHadoopHiveConfigurations( conf: SparkConf, hadoopConf: Configuration): Unit = { // Note: this null check is around more than just access to the "conf" object to maintain // the behavior of the old implementation of this code, for backwards compatibility. 
if (conf != null) { // Explicitly check for S3 environment variables val keyId = System.getenv("AWS_ACCESS_KEY_ID") val accessKey = System.getenv("AWS_SECRET_ACCESS_KEY") if (keyId != null && accessKey != null) { hadoopConf.set("fs.s3.awsAccessKeyId", keyId) hadoopConf.set("fs.s3n.awsAccessKeyId", keyId) hadoopConf.set("fs.s3a.access.key", keyId) hadoopConf.set("fs.s3.awsSecretAccessKey", accessKey) hadoopConf.set("fs.s3n.awsSecretAccessKey", accessKey) hadoopConf.set("fs.s3a.secret.key", accessKey) val sessionToken = System.getenv("AWS_SESSION_TOKEN") if (sessionToken != null) { hadoopConf.set("fs.s3a.session.token", sessionToken) } } appendHiveConfigs(hadoopConf) appendSparkHadoopConfigs(conf, hadoopConf) appendSparkHiveConfigs(conf, hadoopConf) val bufferSize = conf.get(BUFFER_SIZE).toString hadoopConf.set("io.file.buffer.size", bufferSize) } } private lazy val hiveConfKeys = { val configFile = Utils.getContextOrSparkClassLoader.getResource("hive-site.xml") if (configFile != null) { val conf = new Configuration(false) conf.addResource(configFile) conf.iterator().asScala.toSeq } else { Nil } } private def appendHiveConfigs(hadoopConf: Configuration): Unit = { hiveConfKeys.foreach { kv => hadoopConf.set(kv.getKey, kv.getValue) } } private def appendSparkHadoopConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { // Copy any "spark.hadoop.foo=bar" spark properties into conf as "foo=bar" for ((key, value) <- conf.getAll if key.startsWith("spark.hadoop.")) { hadoopConf.set(key.substring("spark.hadoop.".length), value) } if (conf.getOption("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version").isEmpty) { hadoopConf.set("mapreduce.fileoutputcommitter.algorithm.version", "1") } } private def appendSparkHiveConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { // Copy any "spark.hive.foo=bar" spark properties into conf as "hive.foo=bar" for ((key, value) <- conf.getAll if key.startsWith("spark.hive.")) { hadoopConf.set(key.substring("spark.".length), value) } } // scalastyle:off line.size.limit /** * Create a file on the given file system, optionally making sure erasure coding is disabled. * * Disabling EC can be helpful as HDFS EC doesn't support hflush(), hsync(), or append(). * https://hadoop.apache.org/docs/r3.0.0/hadoop-project-dist/hadoop-hdfs/HDFSErasureCoding.html#Limitations */ // scalastyle:on line.size.limit def createFile(fs: FileSystem, path: Path, allowEC: Boolean): FSDataOutputStream = { if (allowEC) { fs.create(path) } else { try { // Use reflection as this uses APIs only available in Hadoop 3 val builderMethod = fs.getClass().getMethod("createFile", classOf[Path]) // the builder api does not resolve relative paths, nor does it create parent dirs, while // the old api does. if (!fs.mkdirs(path.getParent())) { throw new IOException(s"Failed to create parents of $path") } val qualifiedPath = fs.makeQualified(path) val builder = builderMethod.invoke(fs, qualifiedPath) val builderCls = builder.getClass() // this may throw a NoSuchMethodException if the path is not on hdfs val replicateMethod = builderCls.getMethod("replicate") val buildMethod = builderCls.getMethod("build") val b2 = replicateMethod.invoke(builder) buildMethod.invoke(b2).asInstanceOf[FSDataOutputStream] } catch { case _: NoSuchMethodException => // No createFile() method, we're using an older hdfs client, which doesn't give us control // over EC vs. replication.
Older hdfs doesn't have EC anyway, so just create a file with // old apis. fs.create(path) } } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala-spark-3.1/org/apache/hadoop/hbase/spark/datasources/HBaseTableScanRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.datasources import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.spark._ import org.apache.hadoop.hbase.spark.datasources.HBaseResources._ import org.apache.hadoop.hbase.spark.hbase._ import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.{Partition, SparkEnv, TaskContext} import java.util.ArrayList import scala.collection.mutable class HBaseTableScanRDD(relation: HBaseRelation, val hbaseContext: HBaseContext, @transient val filter: Option[SparkSQLPushDownFilter] = None, val columns: Seq[SchemaQualifierDefinition] = Seq.empty )extends RDD[Result](relation.sqlContext.sparkContext, Nil) with Logging { private def sparkConf = SparkEnv.get.conf @transient var ranges = Seq.empty[Range] @transient var points = Seq.empty[Array[Byte]] def addPoint(p: Array[Byte]) { points :+= p } def addRange(r: ScanRange) = { val lower = if (r.lowerBound != null && r.lowerBound.length > 0) { Some(Bound(r.lowerBound, r.isLowerBoundEqualTo)) } else { None } val upper = if (r.upperBound != null && r.upperBound.length > 0) { if (!r.isUpperBoundEqualTo) { Some(Bound(r.upperBound, false)) } else { // HBase stopRow is exclusive: therefore it DOESN'T act like isUpperBoundEqualTo // by default. 
So we need to add a new max byte to the stopRow key val newArray = new Array[Byte](r.upperBound.length + 1) System.arraycopy(r.upperBound, 0, newArray, 0, r.upperBound.length) //New Max Bytes newArray(r.upperBound.length) = ByteMin Some(Bound(newArray, false)) } } else { None } ranges :+= Range(lower, upper) } override def getPartitions: Array[Partition] = { val regions = RegionResource(relation) var idx = 0 logDebug(s"There are ${regions.size} regions") val ps = regions.flatMap { x => val rs = Ranges.and(Range(x), ranges) val ps = Points.and(Range(x), points) if (rs.size > 0 || ps.size > 0) { if(log.isDebugEnabled) { rs.foreach(x => logDebug(x.toString)) } idx += 1 Some(HBaseScanPartition(idx - 1, x, rs, ps, SerializedFilter.toSerializedTypedFilter(filter))) } else { None } }.toArray regions.release() ps.asInstanceOf[Array[Partition]] } override def getPreferredLocations(split: Partition): Seq[String] = { split.asInstanceOf[HBaseScanPartition].regions.server.map { identity }.toSeq } private def buildGets( tbr: TableResource, g: Seq[Array[Byte]], filter: Option[SparkSQLPushDownFilter], columns: Seq[SchemaQualifierDefinition], hbaseContext: HBaseContext): Iterator[Result] = { g.grouped(relation.bulkGetSize).flatMap{ x => val gets = new ArrayList[Get]() x.foreach{ y => val g = new Get(y) columns.foreach { d => if (d.columnFamilyBytes.length > 0) { g.addColumn(d.columnFamilyBytes, d.qualifierBytes) } } filter.foreach(g.setFilter(_)) gets.add(g) } val tmp = tbr.get(gets) rddResources.addResource(tmp) toResultIterator(tmp) } } private def toResultIterator(result: GetResource): Iterator[Result] = { val iterator = new Iterator[Result] { var idx = 0 var cur: Option[Result] = None override def hasNext: Boolean = { while(idx < result.length && cur.isEmpty) { val r = result(idx) idx += 1 if (!r.isEmpty) { cur = Some(r) } } if (cur.isEmpty) { rddResources.release(result) } cur.isDefined } override def next(): Result = { hasNext val ret = cur.get cur = None ret } } iterator } private def buildScan(range: Range, filter: Option[SparkSQLPushDownFilter], columns: Seq[SchemaQualifierDefinition]): Scan = { val scan = (range.lower, range.upper) match { case (Some(Bound(a, b)), Some(Bound(c, d))) => new Scan(a, c) case (None, Some(Bound(c, d))) => new Scan(Array[Byte](), c) case (Some(Bound(a, b)), None) => new Scan(a) case (None, None) => new Scan() } columns.foreach { d => if (d.columnFamilyBytes.length > 0) { scan.addColumn(d.columnFamilyBytes, d.qualifierBytes) } } scan.setCacheBlocks(relation.blockCacheEnable) scan.setBatch(relation.batchNum) scan.setCaching(relation.cacheSize) filter.foreach(scan.setFilter(_)) scan } private def toResultIterator(scanner: ScanResource): Iterator[Result] = { val iterator = new Iterator[Result] { var cur: Option[Result] = None override def hasNext: Boolean = { if (cur.isEmpty) { val r = scanner.next() if (r == null) { rddResources.release(scanner) } else { cur = Some(r) } } cur.isDefined } override def next(): Result = { hasNext val ret = cur.get cur = None ret } } iterator } lazy val rddResources = RDDResources(new mutable.HashSet[Resource]()) private def close() { rddResources.release() } override def compute(split: Partition, context: TaskContext): Iterator[Result] = { val partition = split.asInstanceOf[HBaseScanPartition] val filter = SerializedFilter.fromSerializedFilter(partition.sf) val scans = partition.scanRanges .map(buildScan(_, filter, columns)) val tableResource = TableResource(relation) context.addTaskCompletionListener[Unit](context => close()) val points = 
partition.points val gIt: Iterator[Result] = { if (points.isEmpty) { Iterator.empty: Iterator[Result] } else { buildGets(tableResource, points, filter, columns, hbaseContext) } } val rIts = scans.par .map { scan => val scanner = tableResource.getScanner(scan) rddResources.addResource(scanner) scanner }.map(toResultIterator(_)) .fold(Iterator.empty: Iterator[Result]){ case (x, y) => x ++ y } ++ gIt rIts } } case class SerializedFilter(b: Option[Array[Byte]]) object SerializedFilter { def toSerializedTypedFilter(f: Option[SparkSQLPushDownFilter]): SerializedFilter = { SerializedFilter(f.map(_.toByteArray)) } def fromSerializedFilter(sf: SerializedFilter): Option[SparkSQLPushDownFilter] = { sf.b.map(SparkSQLPushDownFilter.parseFrom(_)) } } private[hbase] case class HBaseRegion( override val index: Int, val start: Option[HBaseType] = None, val end: Option[HBaseType] = None, val server: Option[String] = None) extends Partition private[hbase] case class HBaseScanPartition( override val index: Int, val regions: HBaseRegion, val scanRanges: Seq[Range], val points: Seq[Array[Byte]], val sf: SerializedFilter) extends Partition case class RDDResources(set: mutable.HashSet[Resource]) { def addResource(s: Resource) { set += s } def release() { set.foreach(release(_)) } def release(rs: Resource) { try { rs.release() } finally { set.remove(rs) } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala-spark-3.1/org/apache/spark/deploy/SparkHadoopUtil.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.deploy import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream, File, IOException} import java.security.PrivilegedExceptionAction import java.text.DateFormat import java.util.{Arrays, Date, Locale} import scala.collection.JavaConverters._ import scala.collection.immutable.Map import scala.collection.mutable import scala.collection.mutable.HashMap import scala.util.control.NonFatal import com.google.common.primitives.Longs import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.hadoop.security.token.{Token, TokenIdentifier} import org.apache.hadoop.security.token.delegation.AbstractDelegationTokenIdentifier import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.internal.config.BUFFER_SIZE import org.apache.spark.util.Utils /** * Contains util methods to interact with Hadoop from Spark. 
*/ class SparkHadoopUtil extends Logging { private val sparkConf = new SparkConf(false).loadFromSystemProperties(true) val conf: Configuration = newConfiguration(sparkConf) UserGroupInformation.setConfiguration(conf) /** * Runs the given function with a Hadoop UserGroupInformation as a thread local variable * (distributed to child threads), used for authenticating HDFS and YARN calls. * * IMPORTANT NOTE: If this function is going to be called repeated in the same process * you need to look https://issues.apache.org/jira/browse/HDFS-3545 and possibly * do a FileSystem.closeAllForUGI in order to avoid leaking Filesystems */ def runAsSparkUser(func: () => Unit): Unit = { createSparkUser().doAs(new PrivilegedExceptionAction[Unit] { def run: Unit = func() }) } def createSparkUser(): UserGroupInformation = { val user = Utils.getCurrentUserName() logDebug("creating UGI for user: " + user) val ugi = UserGroupInformation.createRemoteUser(user) transferCredentials(UserGroupInformation.getCurrentUser(), ugi) ugi } def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation): Unit = { dest.addCredentials(source.getCredentials()) } /** * Appends S3-specific, spark.hadoop.*, and spark.buffer.size configurations to a Hadoop * configuration. */ def appendS3AndSparkHadoopHiveConfigurations( conf: SparkConf, hadoopConf: Configuration): Unit = { SparkHadoopUtil.appendS3AndSparkHadoopHiveConfigurations(conf, hadoopConf) } /** * Appends spark.hadoop.* configurations from a [[SparkConf]] to a Hadoop * configuration without the spark.hadoop. prefix. */ def appendSparkHadoopConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { SparkHadoopUtil.appendSparkHadoopConfigs(conf, hadoopConf) } /** * Appends spark.hadoop.* configurations from a Map to another without the spark.hadoop. prefix. */ def appendSparkHadoopConfigs( srcMap: Map[String, String], destMap: HashMap[String, String]): Unit = { // Copy any "spark.hadoop.foo=bar" system properties into destMap as "foo=bar" for ((key, value) <- srcMap if key.startsWith("spark.hadoop.")) { destMap.put(key.substring("spark.hadoop.".length), value) } } def appendSparkHiveConfigs( srcMap: Map[String, String], destMap: HashMap[String, String]): Unit = { // Copy any "spark.hive.foo=bar" system properties into destMap as "hive.foo=bar" for ((key, value) <- srcMap if key.startsWith("spark.hive.")) { destMap.put(key.substring("spark.".length), value) } } /** * Return an appropriate (subclass) of Configuration. Creating config can initialize some Hadoop * subsystems. */ def newConfiguration(conf: SparkConf): Configuration = { val hadoopConf = SparkHadoopUtil.newConfiguration(conf) hadoopConf.addResource(SparkHadoopUtil.SPARK_HADOOP_CONF_FILE) hadoopConf } /** * Add any user credentials to the job conf which are necessary for running on a secure Hadoop * cluster. 
*/ def addCredentials(conf: JobConf): Unit = { val jobCreds = conf.getCredentials() jobCreds.mergeAll(UserGroupInformation.getCurrentUser().getCredentials()) } def addCurrentUserCredentials(creds: Credentials): Unit = { UserGroupInformation.getCurrentUser.addCredentials(creds) } def loginUserFromKeytab(principalName: String, keytabFilename: String): Unit = { if (!new File(keytabFilename).exists()) { throw new SparkException(s"Keytab file: ${keytabFilename} does not exist") } else { logInfo("Attempting to login to Kerberos " + s"using principal: ${principalName} and keytab: ${keytabFilename}") UserGroupInformation.loginUserFromKeytab(principalName, keytabFilename) } } /** * Add or overwrite current user's credentials with serialized delegation tokens, * also confirms correct hadoop configuration is set. */ private[spark] def addDelegationTokens(tokens: Array[Byte], sparkConf: SparkConf): Unit = { UserGroupInformation.setConfiguration(newConfiguration(sparkConf)) val creds = deserialize(tokens) logInfo("Updating delegation tokens for current user.") logDebug(s"Adding/updating delegation tokens ${dumpTokens(creds)}") addCurrentUserCredentials(creds) } /** * Returns a function that can be called to find Hadoop FileSystem bytes read. If * getFSBytesReadOnThreadCallback is called from thread r at time t, the returned callback will * return the bytes read on r since t. */ private[spark] def getFSBytesReadOnThreadCallback(): () => Long = { val f = () => FileSystem.getAllStatistics.asScala.map(_.getThreadStatistics.getBytesRead).sum val baseline = (Thread.currentThread().getId, f()) /** * This function may be called in both spawned child threads and parent task thread (in * PythonRDD), and Hadoop FileSystem uses thread local variables to track the statistics. * So we need a map to track the bytes read from the child threads and parent thread, * summing them together to get the bytes read of this task. */ new Function0[Long] { private val bytesReadMap = new mutable.HashMap[Long, Long]() override def apply(): Long = { bytesReadMap.synchronized { bytesReadMap.put(Thread.currentThread().getId, f()) bytesReadMap.map { case (k, v) => v - (if (k == baseline._1) baseline._2 else 0) }.sum } } } } /** * Returns a function that can be called to find Hadoop FileSystem bytes written. If * getFSBytesWrittenOnThreadCallback is called from thread r at time t, the returned callback will * return the bytes written on r since t. * * @return None if the required method can't be found. */ private[spark] def getFSBytesWrittenOnThreadCallback(): () => Long = { val threadStats = FileSystem.getAllStatistics.asScala.map(_.getThreadStatistics) val f = () => threadStats.map(_.getBytesWritten).sum val baselineBytesWritten = f() () => f() - baselineBytesWritten } /** * Get [[FileStatus]] objects for all leaf children (files) under the given base path. If the * given path points to a file, return a single-element collection containing [[FileStatus]] of * that file. */ def listLeafStatuses(fs: FileSystem, basePath: Path): Seq[FileStatus] = { listLeafStatuses(fs, fs.getFileStatus(basePath)) } /** * Get [[FileStatus]] objects for all leaf children (files) under the given base path. If the * given path points to a file, return a single-element collection containing [[FileStatus]] of * that file. 
*/ def listLeafStatuses(fs: FileSystem, baseStatus: FileStatus): Seq[FileStatus] = { def recurse(status: FileStatus): Seq[FileStatus] = { val (directories, leaves) = fs.listStatus(status.getPath).partition(_.isDirectory) leaves ++ directories.flatMap(f => listLeafStatuses(fs, f)) } if (baseStatus.isDirectory) recurse(baseStatus) else Seq(baseStatus) } def listLeafDirStatuses(fs: FileSystem, basePath: Path): Seq[FileStatus] = { listLeafDirStatuses(fs, fs.getFileStatus(basePath)) } def listLeafDirStatuses(fs: FileSystem, baseStatus: FileStatus): Seq[FileStatus] = { def recurse(status: FileStatus): Seq[FileStatus] = { val (directories, files) = fs.listStatus(status.getPath).partition(_.isDirectory) val leaves = if (directories.isEmpty) Seq(status) else Seq.empty[FileStatus] leaves ++ directories.flatMap(dir => listLeafDirStatuses(fs, dir)) } assert(baseStatus.isDirectory) recurse(baseStatus) } def isGlobPath(pattern: Path): Boolean = { pattern.toString.exists("{}[]*?\\".toSet.contains) } def globPath(pattern: Path): Seq[Path] = { val fs = pattern.getFileSystem(conf) globPath(fs, pattern) } def globPath(fs: FileSystem, pattern: Path): Seq[Path] = { Option(fs.globStatus(pattern)).map { statuses => statuses.map(_.getPath.makeQualified(fs.getUri, fs.getWorkingDirectory)).toSeq }.getOrElse(Seq.empty[Path]) } def globPathIfNecessary(pattern: Path): Seq[Path] = { if (isGlobPath(pattern)) globPath(pattern) else Seq(pattern) } def globPathIfNecessary(fs: FileSystem, pattern: Path): Seq[Path] = { if (isGlobPath(pattern)) globPath(fs, pattern) else Seq(pattern) } /** * Lists all the files in a directory with the specified prefix, and does not end with the * given suffix. The returned {{FileStatus}} instances are sorted by the modification times of * the respective files. */ def listFilesSorted( remoteFs: FileSystem, dir: Path, prefix: String, exclusionSuffix: String): Array[FileStatus] = { try { val fileStatuses = remoteFs.listStatus(dir, new PathFilter { override def accept(path: Path): Boolean = { val name = path.getName name.startsWith(prefix) && !name.endsWith(exclusionSuffix) } }) Arrays.sort(fileStatuses, (o1: FileStatus, o2: FileStatus) => Longs.compare(o1.getModificationTime, o2.getModificationTime)) fileStatuses } catch { case NonFatal(e) => logWarning("Error while attempting to list files from application staging dir", e) Array.empty } } private[spark] def getSuffixForCredentialsPath(credentialsPath: Path): Int = { val fileName = credentialsPath.getName fileName.substring( fileName.lastIndexOf(SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM) + 1).toInt } private val HADOOP_CONF_PATTERN = "(\\$\\{hadoopconf-[^\\}\\$\\s]+\\})".r.unanchored /** * Substitute variables by looking them up in Hadoop configs. Only variables that match the * ${hadoopconf- .. } pattern are substituted. */ def substituteHadoopVariables(text: String, hadoopConf: Configuration): String = { text match { case HADOOP_CONF_PATTERN(matched) => logDebug(text + " matched " + HADOOP_CONF_PATTERN) val key = matched.substring(13, matched.length() - 1) // remove ${hadoopconf- .. } val eval = Option[String](hadoopConf.get(key)) .map { value => logDebug("Substituted " + matched + " with " + value) text.replace(matched, value) } if (eval.isEmpty) { // The variable was not found in Hadoop configs, so return text as is. text } else { // Continue to substitute more variables. 
substituteHadoopVariables(eval.get, hadoopConf) } case _ => logDebug(text + " didn't match " + HADOOP_CONF_PATTERN) text } } /** * Dump the credentials' tokens to string values. * * @param credentials credentials * @return an iterator over the string values. If no credentials are passed in: an empty list */ private[spark] def dumpTokens(credentials: Credentials): Iterable[String] = { if (credentials != null) { credentials.getAllTokens.asScala.map(tokenToString) } else { Seq.empty } } /** * Convert a token to a string for logging. * If its an abstract delegation token, attempt to unmarshall it and then * print more details, including timestamps in human-readable form. * * @param token token to convert to a string * @return a printable string value. */ private[spark] def tokenToString(token: Token[_ <: TokenIdentifier]): String = { val df = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.SHORT, Locale.US) val buffer = new StringBuilder(128) buffer.append(token.toString) try { val ti = token.decodeIdentifier buffer.append("; ").append(ti) ti match { case dt: AbstractDelegationTokenIdentifier => // include human times and the renewer, which the HDFS tokens toString omits buffer.append("; Renewer: ").append(dt.getRenewer) buffer.append("; Issued: ").append(df.format(new Date(dt.getIssueDate))) buffer.append("; Max Date: ").append(df.format(new Date(dt.getMaxDate))) case _ => } } catch { case e: IOException => logDebug(s"Failed to decode $token: $e", e) } buffer.toString } def serialize(creds: Credentials): Array[Byte] = { val byteStream = new ByteArrayOutputStream val dataStream = new DataOutputStream(byteStream) creds.writeTokenStorageToStream(dataStream) byteStream.toByteArray } def deserialize(tokenBytes: Array[Byte]): Credentials = { val tokensBuf = new ByteArrayInputStream(tokenBytes) val creds = new Credentials() creds.readTokenStorageStream(new DataInputStream(tokensBuf)) creds } def isProxyUser(ugi: UserGroupInformation): Boolean = { ugi.getAuthenticationMethod() == UserGroupInformation.AuthenticationMethod.PROXY } } object SparkHadoopUtil extends Logging { private lazy val instance = new SparkHadoopUtil val SPARK_YARN_CREDS_TEMP_EXTENSION = ".tmp" val SPARK_YARN_CREDS_COUNTER_DELIM = "-" /** * Number of records to update input metrics when reading from HadoopRDDs. * * Each update is potentially expensive because we need to use reflection to access the * Hadoop FileSystem API of interest (only available in 2.5), so we should do this sparingly. */ private[spark] val UPDATE_INPUT_METRICS_INTERVAL_RECORDS = 1000 /** * Name of the file containing the gateway's Hadoop configuration, to be overlayed on top of the * cluster's Hadoop config. It is up to the Spark code launching the application to create * this file if it's desired. If the file doesn't exist, it will just be ignored. */ private[spark] val SPARK_HADOOP_CONF_FILE = "__spark_hadoop_conf__.xml" def get: SparkHadoopUtil = instance /** * Returns a Configuration object with Spark configuration applied on top. Unlike * the instance method, this will always return a Configuration instance, and not a * cluster manager-specific type. 
*/ private[spark] def newConfiguration(conf: SparkConf): Configuration = { val hadoopConf = new Configuration() appendS3AndSparkHadoopHiveConfigurations(conf, hadoopConf) hadoopConf } private def appendS3AndSparkHadoopHiveConfigurations( conf: SparkConf, hadoopConf: Configuration): Unit = { // Note: this null check is around more than just access to the "conf" object to maintain // the behavior of the old implementation of this code, for backwards compatibility. if (conf != null) { // Explicitly check for S3 environment variables val keyId = System.getenv("AWS_ACCESS_KEY_ID") val accessKey = System.getenv("AWS_SECRET_ACCESS_KEY") if (keyId != null && accessKey != null) { hadoopConf.set("fs.s3.awsAccessKeyId", keyId) hadoopConf.set("fs.s3n.awsAccessKeyId", keyId) hadoopConf.set("fs.s3a.access.key", keyId) hadoopConf.set("fs.s3.awsSecretAccessKey", accessKey) hadoopConf.set("fs.s3n.awsSecretAccessKey", accessKey) hadoopConf.set("fs.s3a.secret.key", accessKey) val sessionToken = System.getenv("AWS_SESSION_TOKEN") if (sessionToken != null) { hadoopConf.set("fs.s3a.session.token", sessionToken) } } appendHiveConfigs(hadoopConf) appendSparkHadoopConfigs(conf, hadoopConf) appendSparkHiveConfigs(conf, hadoopConf) val bufferSize = conf.get(BUFFER_SIZE).toString hadoopConf.set("io.file.buffer.size", bufferSize) } } private lazy val hiveConfKeys = { val configFile = Utils.getContextOrSparkClassLoader.getResource("hive-site.xml") if (configFile != null) { val conf = new Configuration(false) conf.addResource(configFile) conf.iterator().asScala.toSeq } else { Nil } } private def appendHiveConfigs(hadoopConf: Configuration): Unit = { hiveConfKeys.foreach { kv => hadoopConf.set(kv.getKey, kv.getValue) } } private def appendSparkHadoopConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { // Copy any "spark.hadoop.foo=bar" spark properties into conf as "foo=bar" for ((key, value) <- conf.getAll if key.startsWith("spark.hadoop.")) { hadoopConf.set(key.substring("spark.hadoop.".length), value) } if (conf.getOption("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version").isEmpty) { hadoopConf.set("mapreduce.fileoutputcommitter.algorithm.version", "1") } } private def appendSparkHiveConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { // Copy any "spark.hive.foo=bar" spark properties into conf as "hive.foo=bar" for ((key, value) <- conf.getAll if key.startsWith("spark.hive.")) { hadoopConf.set(key.substring("spark.".length), value) } } // scalastyle:off line.size.limit /** * Create a file on the given file system, optionally making sure erasure coding is disabled. * * Disabling EC can be helpful as HDFS EC doesn't support hflush(), hsync(), or append(). * https://hadoop.apache.org/docs/r3.0.0/hadoop-project-dist/hadoop-hdfs/HDFSErasureCoding.html#Limitations */ // scalastyle:on line.size.limit def createFile(fs: FileSystem, path: Path, allowEC: Boolean): FSDataOutputStream = { if (allowEC) { fs.create(path) } else { try { // Use reflection as this uses APIs only available in Hadoop 3 val builderMethod = fs.getClass().getMethod("createFile", classOf[Path]) // the builder api does not resolve relative paths, nor does it create parent dirs, while // the old api does.
if (!fs.mkdirs(path.getParent())) { throw new IOException(s"Failed to create parents of $path") } val qualifiedPath = fs.makeQualified(path) val builder = builderMethod.invoke(fs, qualifiedPath) val builderCls = builder.getClass() // this may throw a NoSuchMethodException if the path is not on hdfs val replicateMethod = builderCls.getMethod("replicate") val buildMethod = builderCls.getMethod("build") val b2 = replicateMethod.invoke(builder) buildMethod.invoke(b2).asInstanceOf[FSDataOutputStream] } catch { case _: NoSuchMethodException => // No createFile() method, we're using an older hdfs client, which doesn't give us control // over EC vs. replication. Older hdfs doesn't have EC anyway, so just create a file with // old apis. fs.create(path) } } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala-spark-3.2/org/apache/hadoop/hbase/spark/datasources/HBaseTableScanRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.datasources import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.spark._ import org.apache.hadoop.hbase.spark.datasources.HBaseResources._ import org.apache.hadoop.hbase.spark.hbase._ import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.{Partition, SparkEnv, TaskContext} import java.util.ArrayList import scala.collection.mutable class HBaseTableScanRDD(relation: HBaseRelation, val hbaseContext: HBaseContext, @transient val filter: Option[SparkSQLPushDownFilter] = None, val columns: Seq[SchemaQualifierDefinition] = Seq.empty )extends RDD[Result](relation.sqlContext.sparkContext, Nil) with Logging { private def sparkConf = SparkEnv.get.conf @transient var ranges = Seq.empty[Range] @transient var points = Seq.empty[Array[Byte]] def addPoint(p: Array[Byte]) { points :+= p } def addRange(r: ScanRange) = { val lower = if (r.lowerBound != null && r.lowerBound.length > 0) { Some(Bound(r.lowerBound, r.isLowerBoundEqualTo)) } else { None } val upper = if (r.upperBound != null && r.upperBound.length > 0) { if (!r.isUpperBoundEqualTo) { Some(Bound(r.upperBound, false)) } else { // HBase stopRow is exclusive: therefore it DOESN'T act like isUpperBoundEqualTo // by default. 
So we need to add a new max byte to the stopRow key val newArray = new Array[Byte](r.upperBound.length + 1) System.arraycopy(r.upperBound, 0, newArray, 0, r.upperBound.length) //New Max Bytes newArray(r.upperBound.length) = ByteMin Some(Bound(newArray, false)) } } else { None } ranges :+= Range(lower, upper) } override def getPartitions: Array[Partition] = { val regions = RegionResource(relation) var idx = 0 logDebug(s"There are ${regions.size} regions") val ps = regions.flatMap { x => val rs = Ranges.and(Range(x), ranges) val ps = Points.and(Range(x), points) if (rs.size > 0 || ps.size > 0) { if(log.isDebugEnabled) { rs.foreach(x => logDebug(x.toString)) } idx += 1 Some(HBaseScanPartition(idx - 1, x, rs, ps, SerializedFilter.toSerializedTypedFilter(filter))) } else { None } }.toArray regions.release() ps.asInstanceOf[Array[Partition]] } override def getPreferredLocations(split: Partition): Seq[String] = { split.asInstanceOf[HBaseScanPartition].regions.server.map { identity }.toSeq } private def buildGets( tbr: TableResource, g: Seq[Array[Byte]], filter: Option[SparkSQLPushDownFilter], columns: Seq[SchemaQualifierDefinition], hbaseContext: HBaseContext): Iterator[Result] = { g.grouped(relation.bulkGetSize).flatMap{ x => val gets = new ArrayList[Get]() x.foreach{ y => val g = new Get(y) columns.foreach { d => if (d.columnFamilyBytes.length > 0) { g.addColumn(d.columnFamilyBytes, d.qualifierBytes) } } filter.foreach(g.setFilter(_)) gets.add(g) } val tmp = tbr.get(gets) rddResources.addResource(tmp) toResultIterator(tmp) } } private def toResultIterator(result: GetResource): Iterator[Result] = { val iterator = new Iterator[Result] { var idx = 0 var cur: Option[Result] = None override def hasNext: Boolean = { while(idx < result.length && cur.isEmpty) { val r = result(idx) idx += 1 if (!r.isEmpty) { cur = Some(r) } } if (cur.isEmpty) { rddResources.release(result) } cur.isDefined } override def next(): Result = { hasNext val ret = cur.get cur = None ret } } iterator } private def buildScan(range: Range, filter: Option[SparkSQLPushDownFilter], columns: Seq[SchemaQualifierDefinition]): Scan = { val scan = (range.lower, range.upper) match { case (Some(Bound(a, b)), Some(Bound(c, d))) => new Scan(a, c) case (None, Some(Bound(c, d))) => new Scan(Array[Byte](), c) case (Some(Bound(a, b)), None) => new Scan(a) case (None, None) => new Scan() } columns.foreach { d => if (d.columnFamilyBytes.length > 0) { scan.addColumn(d.columnFamilyBytes, d.qualifierBytes) } } scan.setCacheBlocks(relation.blockCacheEnable) scan.setBatch(relation.batchNum) scan.setCaching(relation.cacheSize) filter.foreach(scan.setFilter(_)) scan } private def toResultIterator(scanner: ScanResource): Iterator[Result] = { val iterator = new Iterator[Result] { var cur: Option[Result] = None override def hasNext: Boolean = { if (cur.isEmpty) { val r = scanner.next() if (r == null) { rddResources.release(scanner) } else { cur = Some(r) } } cur.isDefined } override def next(): Result = { hasNext val ret = cur.get cur = None ret } } iterator } lazy val rddResources = RDDResources(new mutable.HashSet[Resource]()) private def close() { rddResources.release() } override def compute(split: Partition, context: TaskContext): Iterator[Result] = { val partition = split.asInstanceOf[HBaseScanPartition] val filter = SerializedFilter.fromSerializedFilter(partition.sf) val scans = partition.scanRanges .map(buildScan(_, filter, columns)) val tableResource = TableResource(relation) context.addTaskCompletionListener[Unit](context => close()) val points = 
partition.points val gIt: Iterator[Result] = { if (points.isEmpty) { Iterator.empty: Iterator[Result] } else { buildGets(tableResource, points, filter, columns, hbaseContext) } } val rIts = scans.par .map { scan => val scanner = tableResource.getScanner(scan) rddResources.addResource(scanner) scanner }.map(toResultIterator(_)) .fold(Iterator.empty: Iterator[Result]){ case (x, y) => x ++ y } ++ gIt rIts } } case class SerializedFilter(b: Option[Array[Byte]]) object SerializedFilter { def toSerializedTypedFilter(f: Option[SparkSQLPushDownFilter]): SerializedFilter = { SerializedFilter(f.map(_.toByteArray)) } def fromSerializedFilter(sf: SerializedFilter): Option[SparkSQLPushDownFilter] = { sf.b.map(SparkSQLPushDownFilter.parseFrom(_)) } } private[hbase] case class HBaseRegion( override val index: Int, val start: Option[HBaseType] = None, val end: Option[HBaseType] = None, val server: Option[String] = None) extends Partition private[hbase] case class HBaseScanPartition( override val index: Int, val regions: HBaseRegion, val scanRanges: Seq[Range], val points: Seq[Array[Byte]], val sf: SerializedFilter) extends Partition case class RDDResources(set: mutable.HashSet[Resource]) { def addResource(s: Resource) { set += s } def release() { set.foreach(release(_)) } def release(rs: Resource) { try { rs.release() } finally { set.remove(rs) } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala-spark-3.2/org/apache/spark/deploy/SparkHadoopUtil.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.deploy import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream, File, IOException} import java.security.PrivilegedExceptionAction import java.text.DateFormat import java.util.{Arrays, Date, Locale} import scala.collection.JavaConverters._ import scala.collection.immutable.Map import scala.collection.mutable import scala.collection.mutable.HashMap import scala.util.control.NonFatal import com.google.common.primitives.Longs import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.hadoop.security.token.{Token, TokenIdentifier} import org.apache.hadoop.security.token.delegation.AbstractDelegationTokenIdentifier import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.internal.config.BUFFER_SIZE import org.apache.spark.util.Utils /** * Contains util methods to interact with Hadoop from Spark. 
*/ class SparkHadoopUtil extends Logging { private val sparkConf = new SparkConf(false).loadFromSystemProperties(true) val conf: Configuration = newConfiguration(sparkConf) UserGroupInformation.setConfiguration(conf) /** * Runs the given function with a Hadoop UserGroupInformation as a thread local variable * (distributed to child threads), used for authenticating HDFS and YARN calls. * * IMPORTANT NOTE: If this function is going to be called repeated in the same process * you need to look https://issues.apache.org/jira/browse/HDFS-3545 and possibly * do a FileSystem.closeAllForUGI in order to avoid leaking Filesystems */ def runAsSparkUser(func: () => Unit): Unit = { createSparkUser().doAs(new PrivilegedExceptionAction[Unit] { def run: Unit = func() }) } def createSparkUser(): UserGroupInformation = { val user = Utils.getCurrentUserName() logDebug("creating UGI for user: " + user) val ugi = UserGroupInformation.createRemoteUser(user) transferCredentials(UserGroupInformation.getCurrentUser(), ugi) ugi } def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation): Unit = { dest.addCredentials(source.getCredentials()) } /** * Appends S3-specific, spark.hadoop.*, and spark.buffer.size configurations to a Hadoop * configuration. */ def appendS3AndSparkHadoopHiveConfigurations( conf: SparkConf, hadoopConf: Configuration): Unit = { SparkHadoopUtil.appendS3AndSparkHadoopHiveConfigurations(conf, hadoopConf) } /** * Appends spark.hadoop.* configurations from a [[SparkConf]] to a Hadoop * configuration without the spark.hadoop. prefix. */ def appendSparkHadoopConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { SparkHadoopUtil.appendSparkHadoopConfigs(conf, hadoopConf) } /** * Appends spark.hadoop.* configurations from a Map to another without the spark.hadoop. prefix. */ def appendSparkHadoopConfigs( srcMap: Map[String, String], destMap: HashMap[String, String]): Unit = { // Copy any "spark.hadoop.foo=bar" system properties into destMap as "foo=bar" for ((key, value) <- srcMap if key.startsWith("spark.hadoop.")) { destMap.put(key.substring("spark.hadoop.".length), value) } } def appendSparkHiveConfigs( srcMap: Map[String, String], destMap: HashMap[String, String]): Unit = { // Copy any "spark.hive.foo=bar" system properties into destMap as "hive.foo=bar" for ((key, value) <- srcMap if key.startsWith("spark.hive.")) { destMap.put(key.substring("spark.".length), value) } } /** * Return an appropriate (subclass) of Configuration. Creating config can initialize some Hadoop * subsystems. */ def newConfiguration(conf: SparkConf): Configuration = { val hadoopConf = SparkHadoopUtil.newConfiguration(conf) hadoopConf.addResource(SparkHadoopUtil.SPARK_HADOOP_CONF_FILE) hadoopConf } /** * Add any user credentials to the job conf which are necessary for running on a secure Hadoop * cluster. 
*/ def addCredentials(conf: JobConf): Unit = { val jobCreds = conf.getCredentials() jobCreds.mergeAll(UserGroupInformation.getCurrentUser().getCredentials()) } def addCurrentUserCredentials(creds: Credentials): Unit = { UserGroupInformation.getCurrentUser.addCredentials(creds) } def loginUserFromKeytab(principalName: String, keytabFilename: String): Unit = { if (!new File(keytabFilename).exists()) { throw new SparkException(s"Keytab file: ${keytabFilename} does not exist") } else { logInfo("Attempting to login to Kerberos " + s"using principal: ${principalName} and keytab: ${keytabFilename}") UserGroupInformation.loginUserFromKeytab(principalName, keytabFilename) } } /** * Add or overwrite current user's credentials with serialized delegation tokens, * also confirms correct hadoop configuration is set. */ private[spark] def addDelegationTokens(tokens: Array[Byte], sparkConf: SparkConf): Unit = { UserGroupInformation.setConfiguration(newConfiguration(sparkConf)) val creds = deserialize(tokens) logInfo("Updating delegation tokens for current user.") logDebug(s"Adding/updating delegation tokens ${dumpTokens(creds)}") addCurrentUserCredentials(creds) } /** * Returns a function that can be called to find Hadoop FileSystem bytes read. If * getFSBytesReadOnThreadCallback is called from thread r at time t, the returned callback will * return the bytes read on r since t. */ private[spark] def getFSBytesReadOnThreadCallback(): () => Long = { val f = () => FileSystem.getAllStatistics.asScala.map(_.getThreadStatistics.getBytesRead).sum val baseline = (Thread.currentThread().getId, f()) /** * This function may be called in both spawned child threads and parent task thread (in * PythonRDD), and Hadoop FileSystem uses thread local variables to track the statistics. * So we need a map to track the bytes read from the child threads and parent thread, * summing them together to get the bytes read of this task. */ new Function0[Long] { private val bytesReadMap = new mutable.HashMap[Long, Long]() override def apply(): Long = { bytesReadMap.synchronized { bytesReadMap.put(Thread.currentThread().getId, f()) bytesReadMap.map { case (k, v) => v - (if (k == baseline._1) baseline._2 else 0) }.sum } } } } /** * Returns a function that can be called to find Hadoop FileSystem bytes written. If * getFSBytesWrittenOnThreadCallback is called from thread r at time t, the returned callback will * return the bytes written on r since t. * * @return None if the required method can't be found. */ private[spark] def getFSBytesWrittenOnThreadCallback(): () => Long = { val threadStats = FileSystem.getAllStatistics.asScala.map(_.getThreadStatistics) val f = () => threadStats.map(_.getBytesWritten).sum val baselineBytesWritten = f() () => f() - baselineBytesWritten } /** * Get [[FileStatus]] objects for all leaf children (files) under the given base path. If the * given path points to a file, return a single-element collection containing [[FileStatus]] of * that file. */ def listLeafStatuses(fs: FileSystem, basePath: Path): Seq[FileStatus] = { listLeafStatuses(fs, fs.getFileStatus(basePath)) } /** * Get [[FileStatus]] objects for all leaf children (files) under the given base path. If the * given path points to a file, return a single-element collection containing [[FileStatus]] of * that file. 
*/ def listLeafStatuses(fs: FileSystem, baseStatus: FileStatus): Seq[FileStatus] = { def recurse(status: FileStatus): Seq[FileStatus] = { val (directories, leaves) = fs.listStatus(status.getPath).partition(_.isDirectory) leaves ++ directories.flatMap(f => listLeafStatuses(fs, f)) } if (baseStatus.isDirectory) recurse(baseStatus) else Seq(baseStatus) } def listLeafDirStatuses(fs: FileSystem, basePath: Path): Seq[FileStatus] = { listLeafDirStatuses(fs, fs.getFileStatus(basePath)) } def listLeafDirStatuses(fs: FileSystem, baseStatus: FileStatus): Seq[FileStatus] = { def recurse(status: FileStatus): Seq[FileStatus] = { val (directories, files) = fs.listStatus(status.getPath).partition(_.isDirectory) val leaves = if (directories.isEmpty) Seq(status) else Seq.empty[FileStatus] leaves ++ directories.flatMap(dir => listLeafDirStatuses(fs, dir)) } assert(baseStatus.isDirectory) recurse(baseStatus) } def isGlobPath(pattern: Path): Boolean = { pattern.toString.exists("{}[]*?\\".toSet.contains) } def globPath(pattern: Path): Seq[Path] = { val fs = pattern.getFileSystem(conf) globPath(fs, pattern) } def globPath(fs: FileSystem, pattern: Path): Seq[Path] = { Option(fs.globStatus(pattern)).map { statuses => statuses.map(_.getPath.makeQualified(fs.getUri, fs.getWorkingDirectory)).toSeq }.getOrElse(Seq.empty[Path]) } def globPathIfNecessary(pattern: Path): Seq[Path] = { if (isGlobPath(pattern)) globPath(pattern) else Seq(pattern) } def globPathIfNecessary(fs: FileSystem, pattern: Path): Seq[Path] = { if (isGlobPath(pattern)) globPath(fs, pattern) else Seq(pattern) } /** * Lists all the files in a directory with the specified prefix, and does not end with the * given suffix. The returned {{FileStatus}} instances are sorted by the modification times of * the respective files. */ def listFilesSorted( remoteFs: FileSystem, dir: Path, prefix: String, exclusionSuffix: String): Array[FileStatus] = { try { val fileStatuses = remoteFs.listStatus(dir, new PathFilter { override def accept(path: Path): Boolean = { val name = path.getName name.startsWith(prefix) && !name.endsWith(exclusionSuffix) } }) Arrays.sort(fileStatuses, (o1: FileStatus, o2: FileStatus) => Longs.compare(o1.getModificationTime, o2.getModificationTime)) fileStatuses } catch { case NonFatal(e) => logWarning("Error while attempting to list files from application staging dir", e) Array.empty } } private[spark] def getSuffixForCredentialsPath(credentialsPath: Path): Int = { val fileName = credentialsPath.getName fileName.substring( fileName.lastIndexOf(SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM) + 1).toInt } private val HADOOP_CONF_PATTERN = "(\\$\\{hadoopconf-[^\\}\\$\\s]+\\})".r.unanchored /** * Substitute variables by looking them up in Hadoop configs. Only variables that match the * ${hadoopconf- .. } pattern are substituted. */ def substituteHadoopVariables(text: String, hadoopConf: Configuration): String = { text match { case HADOOP_CONF_PATTERN(matched) => logDebug(text + " matched " + HADOOP_CONF_PATTERN) val key = matched.substring(13, matched.length() - 1) // remove ${hadoopconf- .. } val eval = Option[String](hadoopConf.get(key)) .map { value => logDebug("Substituted " + matched + " with " + value) text.replace(matched, value) } if (eval.isEmpty) { // The variable was not found in Hadoop configs, so return text as is. text } else { // Continue to substitute more variables. 
substituteHadoopVariables(eval.get, hadoopConf) } case _ => logDebug(text + " didn't match " + HADOOP_CONF_PATTERN) text } } /** * Dump the credentials' tokens to string values. * * @param credentials credentials * @return an iterator over the string values. If no credentials are passed in: an empty list */ private[spark] def dumpTokens(credentials: Credentials): Iterable[String] = { if (credentials != null) { credentials.getAllTokens.asScala.map(tokenToString) } else { Seq.empty } } /** * Convert a token to a string for logging. * If its an abstract delegation token, attempt to unmarshall it and then * print more details, including timestamps in human-readable form. * * @param token token to convert to a string * @return a printable string value. */ private[spark] def tokenToString(token: Token[_ <: TokenIdentifier]): String = { val df = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.SHORT, Locale.US) val buffer = new StringBuilder(128) buffer.append(token.toString) try { val ti = token.decodeIdentifier buffer.append("; ").append(ti) ti match { case dt: AbstractDelegationTokenIdentifier => // include human times and the renewer, which the HDFS tokens toString omits buffer.append("; Renewer: ").append(dt.getRenewer) buffer.append("; Issued: ").append(df.format(new Date(dt.getIssueDate))) buffer.append("; Max Date: ").append(df.format(new Date(dt.getMaxDate))) case _ => } } catch { case e: IOException => logDebug(s"Failed to decode $token: $e", e) } buffer.toString } def serialize(creds: Credentials): Array[Byte] = { val byteStream = new ByteArrayOutputStream val dataStream = new DataOutputStream(byteStream) creds.writeTokenStorageToStream(dataStream) byteStream.toByteArray } def deserialize(tokenBytes: Array[Byte]): Credentials = { val tokensBuf = new ByteArrayInputStream(tokenBytes) val creds = new Credentials() creds.readTokenStorageStream(new DataInputStream(tokensBuf)) creds } def isProxyUser(ugi: UserGroupInformation): Boolean = { ugi.getAuthenticationMethod() == UserGroupInformation.AuthenticationMethod.PROXY } } object SparkHadoopUtil extends Logging { private lazy val instance = new SparkHadoopUtil val SPARK_YARN_CREDS_TEMP_EXTENSION = ".tmp" val SPARK_YARN_CREDS_COUNTER_DELIM = "-" /** * Number of records to update input metrics when reading from HadoopRDDs. * * Each update is potentially expensive because we need to use reflection to access the * Hadoop FileSystem API of interest (only available in 2.5), so we should do this sparingly. */ private[spark] val UPDATE_INPUT_METRICS_INTERVAL_RECORDS = 1000 /** * Name of the file containing the gateway's Hadoop configuration, to be overlayed on top of the * cluster's Hadoop config. It is up to the Spark code launching the application to create * this file if it's desired. If the file doesn't exist, it will just be ignored. */ private[spark] val SPARK_HADOOP_CONF_FILE = "__spark_hadoop_conf__.xml" def get: SparkHadoopUtil = instance /** * Returns a Configuration object with Spark configuration applied on top. Unlike * the instance method, this will always return a Configuration instance, and not a * cluster manager-specific type. 
*/ private[spark] def newConfiguration(conf: SparkConf): Configuration = { val hadoopConf = new Configuration() appendS3AndSparkHadoopHiveConfigurations(conf, hadoopConf) hadoopConf } private def appendS3AndSparkHadoopHiveConfigurations( conf: SparkConf, hadoopConf: Configuration): Unit = { // Note: this null check is around more than just access to the "conf" object to maintain // the behavior of the old implementation of this code, for backwards compatibility. if (conf != null) { // Explicitly check for S3 environment variables val keyId = System.getenv("AWS_ACCESS_KEY_ID") val accessKey = System.getenv("AWS_SECRET_ACCESS_KEY") if (keyId != null && accessKey != null) { hadoopConf.set("fs.s3.awsAccessKeyId", keyId) hadoopConf.set("fs.s3n.awsAccessKeyId", keyId) hadoopConf.set("fs.s3a.access.key", keyId) hadoopConf.set("fs.s3.awsSecretAccessKey", accessKey) hadoopConf.set("fs.s3n.awsSecretAccessKey", accessKey) hadoopConf.set("fs.s3a.secret.key", accessKey) val sessionToken = System.getenv("AWS_SESSION_TOKEN") if (sessionToken != null) { hadoopConf.set("fs.s3a.session.token", sessionToken) } } appendHiveConfigs(hadoopConf) appendSparkHadoopConfigs(conf, hadoopConf) appendSparkHiveConfigs(conf, hadoopConf) val bufferSize = conf.get(BUFFER_SIZE).toString hadoopConf.set("io.file.buffer.size", bufferSize) } } private lazy val hiveConfKeys = { val configFile = Utils.getContextOrSparkClassLoader.getResource("hive-site.xml") if (configFile != null) { val conf = new Configuration(false) conf.addResource(configFile) conf.iterator().asScala.toSeq } else { Nil } } private def appendHiveConfigs(hadoopConf: Configuration): Unit = { hiveConfKeys.foreach { kv => hadoopConf.set(kv.getKey, kv.getValue) } } private def appendSparkHadoopConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { // Copy any "spark.hadoop.foo=bar" spark properties into conf as "foo=bar" for ((key, value) <- conf.getAll if key.startsWith("spark.hadoop.")) { hadoopConf.set(key.substring("spark.hadoop.".length), value) } if (conf.getOption("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version").isEmpty) { hadoopConf.set("mapreduce.fileoutputcommitter.algorithm.version", "1") } } private def appendSparkHiveConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { // Copy any "spark.hive.foo=bar" spark properties into conf as "hive.foo=bar" for ((key, value) <- conf.getAll if key.startsWith("spark.hive.")) { hadoopConf.set(key.substring("spark.".length), value) } } // scalastyle:off line.size.limit /** * Create a file on the given file system, optionally making sure erasure coding is disabled. * * Disabling EC can be helpful as HDFS EC doesn't support hflush(), hsync(), or append(). * Disabling EC can be helpful as HDFS EC doesn't support hflush(), hsync(), or append(). * https://hadoop.apache.org/docs/r3.0.0/hadoop-project-dist/hadoop-hdfs/HDFSErasureCoding.html#Limitations */ // scalastyle:on line.size.limit def createFile(fs: FileSystem, path: Path, allowEC: Boolean): FSDataOutputStream = { if (allowEC) { fs.create(path) } else { try { // Use reflection as this uses APIs only available in Hadoop 3 val builderMethod = fs.getClass().getMethod("createFile", classOf[Path]) // the builder api does not resolve relative paths, nor does it create parent dirs, while // the old api does. 
if (!fs.mkdirs(path.getParent())) { throw new IOException(s"Failed to create parents of $path") } val qualifiedPath = fs.makeQualified(path) val builder = builderMethod.invoke(fs, qualifiedPath) val builderCls = builder.getClass() // this may throw a NoSuchMethodException if the path is not on hdfs val replicateMethod = builderCls.getMethod("replicate") val buildMethod = builderCls.getMethod("build") val b2 = replicateMethod.invoke(builder) buildMethod.invoke(b2).asInstanceOf[FSDataOutputStream] } catch { case _: NoSuchMethodException => // No createFile() method, we're using an older hdfs client, which doesn't give us control // over EC vs. replication. Older hdfs doesn't have EC anyway, so just create a file with // old apis. fs.create(path) } } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala-spark-3.3/org/apache/hadoop/hbase/spark/datasources/HBaseTableScanRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.spark.datasources import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.spark._ import org.apache.hadoop.hbase.spark.datasources.HBaseResources._ import org.apache.hadoop.hbase.spark.hbase._ import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.{Partition, SparkEnv, TaskContext} import java.util.ArrayList import scala.collection.mutable class HBaseTableScanRDD(relation: HBaseRelation, val hbaseContext: HBaseContext, @transient val filter: Option[SparkSQLPushDownFilter] = None, val columns: Seq[SchemaQualifierDefinition] = Seq.empty )extends RDD[Result](relation.sqlContext.sparkContext, Nil) with Logging { private def sparkConf = SparkEnv.get.conf @transient var ranges = Seq.empty[Range] @transient var points = Seq.empty[Array[Byte]] def addPoint(p: Array[Byte]) { points :+= p } def addRange(r: ScanRange) = { val lower = if (r.lowerBound != null && r.lowerBound.length > 0) { Some(Bound(r.lowerBound, r.isLowerBoundEqualTo)) } else { None } val upper = if (r.upperBound != null && r.upperBound.length > 0) { if (!r.isUpperBoundEqualTo) { Some(Bound(r.upperBound, false)) } else { // HBase stopRow is exclusive: therefore it DOESN'T act like isUpperBoundEqualTo // by default. 
So we need to add a new max byte to the stopRow key val newArray = new Array[Byte](r.upperBound.length + 1) System.arraycopy(r.upperBound, 0, newArray, 0, r.upperBound.length) //New Max Bytes newArray(r.upperBound.length) = ByteMin Some(Bound(newArray, false)) } } else { None } ranges :+= Range(lower, upper) } override def getPartitions: Array[Partition] = { val regions = RegionResource(relation) var idx = 0 logDebug(s"There are ${regions.size} regions") val ps = regions.flatMap { x => val rs = Ranges.and(Range(x), ranges) val ps = Points.and(Range(x), points) if (rs.size > 0 || ps.size > 0) { if(log.isDebugEnabled) { rs.foreach(x => logDebug(x.toString)) } idx += 1 Some(HBaseScanPartition(idx - 1, x, rs, ps, SerializedFilter.toSerializedTypedFilter(filter))) } else { None } }.toArray regions.release() ps.asInstanceOf[Array[Partition]] } override def getPreferredLocations(split: Partition): Seq[String] = { split.asInstanceOf[HBaseScanPartition].regions.server.map { identity }.toSeq } private def buildGets( tbr: TableResource, g: Seq[Array[Byte]], filter: Option[SparkSQLPushDownFilter], columns: Seq[SchemaQualifierDefinition], hbaseContext: HBaseContext): Iterator[Result] = { g.grouped(relation.bulkGetSize).flatMap{ x => val gets = new ArrayList[Get]() x.foreach{ y => val g = new Get(y) columns.foreach { d => if (d.columnFamilyBytes.length > 0) { g.addColumn(d.columnFamilyBytes, d.qualifierBytes) } } filter.foreach(g.setFilter(_)) gets.add(g) } val tmp = tbr.get(gets) rddResources.addResource(tmp) toResultIterator(tmp) } } private def toResultIterator(result: GetResource): Iterator[Result] = { val iterator = new Iterator[Result] { var idx = 0 var cur: Option[Result] = None override def hasNext: Boolean = { while(idx < result.length && cur.isEmpty) { val r = result(idx) idx += 1 if (!r.isEmpty) { cur = Some(r) } } if (cur.isEmpty) { rddResources.release(result) } cur.isDefined } override def next(): Result = { hasNext val ret = cur.get cur = None ret } } iterator } private def buildScan(range: Range, filter: Option[SparkSQLPushDownFilter], columns: Seq[SchemaQualifierDefinition]): Scan = { val scan = (range.lower, range.upper) match { case (Some(Bound(a, b)), Some(Bound(c, d))) => new Scan(a, c) case (None, Some(Bound(c, d))) => new Scan(Array[Byte](), c) case (Some(Bound(a, b)), None) => new Scan(a) case (None, None) => new Scan() } columns.foreach { d => if (d.columnFamilyBytes.length > 0) { scan.addColumn(d.columnFamilyBytes, d.qualifierBytes) } } scan.setCacheBlocks(relation.blockCacheEnable) scan.setBatch(relation.batchNum) scan.setCaching(relation.cacheSize) filter.foreach(scan.setFilter(_)) scan } private def toResultIterator(scanner: ScanResource): Iterator[Result] = { val iterator = new Iterator[Result] { var cur: Option[Result] = None override def hasNext: Boolean = { if (cur.isEmpty) { val r = scanner.next() if (r == null) { rddResources.release(scanner) } else { cur = Some(r) } } cur.isDefined } override def next(): Result = { hasNext val ret = cur.get cur = None ret } } iterator } lazy val rddResources = RDDResources(new mutable.HashSet[Resource]()) private def close() { rddResources.release() } override def compute(split: Partition, context: TaskContext): Iterator[Result] = { val partition = split.asInstanceOf[HBaseScanPartition] val filter = SerializedFilter.fromSerializedFilter(partition.sf) val scans = partition.scanRanges .map(buildScan(_, filter, columns)) val tableResource = TableResource(relation) context.addTaskCompletionListener[Unit](context => close()) val points = 
partition.points val gIt: Iterator[Result] = { if (points.isEmpty) { Iterator.empty: Iterator[Result] } else { buildGets(tableResource, points, filter, columns, hbaseContext) } } val rIts = scans.par .map { scan => val scanner = tableResource.getScanner(scan) rddResources.addResource(scanner) scanner }.map(toResultIterator(_)) .fold(Iterator.empty: Iterator[Result]){ case (x, y) => x ++ y } ++ gIt rIts } } case class SerializedFilter(b: Option[Array[Byte]]) object SerializedFilter { def toSerializedTypedFilter(f: Option[SparkSQLPushDownFilter]): SerializedFilter = { SerializedFilter(f.map(_.toByteArray)) } def fromSerializedFilter(sf: SerializedFilter): Option[SparkSQLPushDownFilter] = { sf.b.map(SparkSQLPushDownFilter.parseFrom(_)) } } private[hbase] case class HBaseRegion( override val index: Int, val start: Option[HBaseType] = None, val end: Option[HBaseType] = None, val server: Option[String] = None) extends Partition private[hbase] case class HBaseScanPartition( override val index: Int, val regions: HBaseRegion, val scanRanges: Seq[Range], val points: Seq[Array[Byte]], val sf: SerializedFilter) extends Partition case class RDDResources(set: mutable.HashSet[Resource]) { def addResource(s: Resource) { set += s } def release() { set.foreach(release(_)) } def release(rs: Resource) { try { rs.release() } finally { set.remove(rs) } } } ================================================ FILE: fire-connectors/spark-connectors/spark-hbase/src/main/scala-spark-3.3/org/apache/spark/deploy/SparkHadoopUtil.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.deploy import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream, File, IOException} import java.security.PrivilegedExceptionAction import java.text.DateFormat import java.util.{Arrays, Date, Locale} import scala.collection.JavaConverters._ import scala.collection.immutable.Map import scala.collection.mutable import scala.collection.mutable.HashMap import scala.util.control.NonFatal import com.google.common.primitives.Longs import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.hadoop.security.token.{Token, TokenIdentifier} import org.apache.hadoop.security.token.delegation.AbstractDelegationTokenIdentifier import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.internal.config.BUFFER_SIZE import org.apache.spark.util.Utils /** * Contains util methods to interact with Hadoop from Spark. 
*/ class SparkHadoopUtil extends Logging { private val sparkConf = new SparkConf(false).loadFromSystemProperties(true) val conf: Configuration = newConfiguration(sparkConf) UserGroupInformation.setConfiguration(conf) /** * Runs the given function with a Hadoop UserGroupInformation as a thread local variable * (distributed to child threads), used for authenticating HDFS and YARN calls. * * IMPORTANT NOTE: If this function is going to be called repeated in the same process * you need to look https://issues.apache.org/jira/browse/HDFS-3545 and possibly * do a FileSystem.closeAllForUGI in order to avoid leaking Filesystems */ def runAsSparkUser(func: () => Unit): Unit = { createSparkUser().doAs(new PrivilegedExceptionAction[Unit] { def run: Unit = func() }) } def createSparkUser(): UserGroupInformation = { val user = Utils.getCurrentUserName() logDebug("creating UGI for user: " + user) val ugi = UserGroupInformation.createRemoteUser(user) transferCredentials(UserGroupInformation.getCurrentUser(), ugi) ugi } def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation): Unit = { dest.addCredentials(source.getCredentials()) } /** * Appends S3-specific, spark.hadoop.*, and spark.buffer.size configurations to a Hadoop * configuration. */ def appendS3AndSparkHadoopHiveConfigurations( conf: SparkConf, hadoopConf: Configuration): Unit = { SparkHadoopUtil.appendS3AndSparkHadoopHiveConfigurations(conf, hadoopConf) } /** * Appends spark.hadoop.* configurations from a [[SparkConf]] to a Hadoop * configuration without the spark.hadoop. prefix. */ def appendSparkHadoopConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { SparkHadoopUtil.appendSparkHadoopConfigs(conf, hadoopConf) } /** * Appends spark.hadoop.* configurations from a Map to another without the spark.hadoop. prefix. */ def appendSparkHadoopConfigs( srcMap: Map[String, String], destMap: HashMap[String, String]): Unit = { // Copy any "spark.hadoop.foo=bar" system properties into destMap as "foo=bar" for ((key, value) <- srcMap if key.startsWith("spark.hadoop.")) { destMap.put(key.substring("spark.hadoop.".length), value) } } def appendSparkHiveConfigs( srcMap: Map[String, String], destMap: HashMap[String, String]): Unit = { // Copy any "spark.hive.foo=bar" system properties into destMap as "hive.foo=bar" for ((key, value) <- srcMap if key.startsWith("spark.hive.")) { destMap.put(key.substring("spark.".length), value) } } /** * Return an appropriate (subclass) of Configuration. Creating config can initialize some Hadoop * subsystems. */ def newConfiguration(conf: SparkConf): Configuration = { val hadoopConf = SparkHadoopUtil.newConfiguration(conf) hadoopConf.addResource(SparkHadoopUtil.SPARK_HADOOP_CONF_FILE) hadoopConf } /** * Add any user credentials to the job conf which are necessary for running on a secure Hadoop * cluster. 
*/ def addCredentials(conf: JobConf): Unit = { val jobCreds = conf.getCredentials() jobCreds.mergeAll(UserGroupInformation.getCurrentUser().getCredentials()) } def addCurrentUserCredentials(creds: Credentials): Unit = { UserGroupInformation.getCurrentUser.addCredentials(creds) } def loginUserFromKeytab(principalName: String, keytabFilename: String): Unit = { if (!new File(keytabFilename).exists()) { throw new SparkException(s"Keytab file: ${keytabFilename} does not exist") } else { logInfo("Attempting to login to Kerberos " + s"using principal: ${principalName} and keytab: ${keytabFilename}") UserGroupInformation.loginUserFromKeytab(principalName, keytabFilename) } } /** * Add or overwrite current user's credentials with serialized delegation tokens, * also confirms correct hadoop configuration is set. */ private[spark] def addDelegationTokens(tokens: Array[Byte], sparkConf: SparkConf): Unit = { UserGroupInformation.setConfiguration(newConfiguration(sparkConf)) val creds = deserialize(tokens) logInfo("Updating delegation tokens for current user.") logDebug(s"Adding/updating delegation tokens ${dumpTokens(creds)}") addCurrentUserCredentials(creds) } /** * Returns a function that can be called to find Hadoop FileSystem bytes read. If * getFSBytesReadOnThreadCallback is called from thread r at time t, the returned callback will * return the bytes read on r since t. */ private[spark] def getFSBytesReadOnThreadCallback(): () => Long = { val f = () => FileSystem.getAllStatistics.asScala.map(_.getThreadStatistics.getBytesRead).sum val baseline = (Thread.currentThread().getId, f()) /** * This function may be called in both spawned child threads and parent task thread (in * PythonRDD), and Hadoop FileSystem uses thread local variables to track the statistics. * So we need a map to track the bytes read from the child threads and parent thread, * summing them together to get the bytes read of this task. */ new Function0[Long] { private val bytesReadMap = new mutable.HashMap[Long, Long]() override def apply(): Long = { bytesReadMap.synchronized { bytesReadMap.put(Thread.currentThread().getId, f()) bytesReadMap.map { case (k, v) => v - (if (k == baseline._1) baseline._2 else 0) }.sum } } } } /** * Returns a function that can be called to find Hadoop FileSystem bytes written. If * getFSBytesWrittenOnThreadCallback is called from thread r at time t, the returned callback will * return the bytes written on r since t. * * @return None if the required method can't be found. */ private[spark] def getFSBytesWrittenOnThreadCallback(): () => Long = { val threadStats = FileSystem.getAllStatistics.asScala.map(_.getThreadStatistics) val f = () => threadStats.map(_.getBytesWritten).sum val baselineBytesWritten = f() () => f() - baselineBytesWritten } /** * Get [[FileStatus]] objects for all leaf children (files) under the given base path. If the * given path points to a file, return a single-element collection containing [[FileStatus]] of * that file. */ def listLeafStatuses(fs: FileSystem, basePath: Path): Seq[FileStatus] = { listLeafStatuses(fs, fs.getFileStatus(basePath)) } /** * Get [[FileStatus]] objects for all leaf children (files) under the given base path. If the * given path points to a file, return a single-element collection containing [[FileStatus]] of * that file. 
*/ def listLeafStatuses(fs: FileSystem, baseStatus: FileStatus): Seq[FileStatus] = { def recurse(status: FileStatus): Seq[FileStatus] = { val (directories, leaves) = fs.listStatus(status.getPath).partition(_.isDirectory) leaves ++ directories.flatMap(f => listLeafStatuses(fs, f)) } if (baseStatus.isDirectory) recurse(baseStatus) else Seq(baseStatus) } def listLeafDirStatuses(fs: FileSystem, basePath: Path): Seq[FileStatus] = { listLeafDirStatuses(fs, fs.getFileStatus(basePath)) } def listLeafDirStatuses(fs: FileSystem, baseStatus: FileStatus): Seq[FileStatus] = { def recurse(status: FileStatus): Seq[FileStatus] = { val (directories, files) = fs.listStatus(status.getPath).partition(_.isDirectory) val leaves = if (directories.isEmpty) Seq(status) else Seq.empty[FileStatus] leaves ++ directories.flatMap(dir => listLeafDirStatuses(fs, dir)) } assert(baseStatus.isDirectory) recurse(baseStatus) } def isGlobPath(pattern: Path): Boolean = { pattern.toString.exists("{}[]*?\\".toSet.contains) } def globPath(pattern: Path): Seq[Path] = { val fs = pattern.getFileSystem(conf) globPath(fs, pattern) } def globPath(fs: FileSystem, pattern: Path): Seq[Path] = { Option(fs.globStatus(pattern)).map { statuses => statuses.map(_.getPath.makeQualified(fs.getUri, fs.getWorkingDirectory)).toSeq }.getOrElse(Seq.empty[Path]) } def globPathIfNecessary(pattern: Path): Seq[Path] = { if (isGlobPath(pattern)) globPath(pattern) else Seq(pattern) } def globPathIfNecessary(fs: FileSystem, pattern: Path): Seq[Path] = { if (isGlobPath(pattern)) globPath(fs, pattern) else Seq(pattern) } /** * Lists all the files in a directory with the specified prefix, and does not end with the * given suffix. The returned {{FileStatus}} instances are sorted by the modification times of * the respective files. */ def listFilesSorted( remoteFs: FileSystem, dir: Path, prefix: String, exclusionSuffix: String): Array[FileStatus] = { try { val fileStatuses = remoteFs.listStatus(dir, new PathFilter { override def accept(path: Path): Boolean = { val name = path.getName name.startsWith(prefix) && !name.endsWith(exclusionSuffix) } }) Arrays.sort(fileStatuses, (o1: FileStatus, o2: FileStatus) => Longs.compare(o1.getModificationTime, o2.getModificationTime)) fileStatuses } catch { case NonFatal(e) => logWarning("Error while attempting to list files from application staging dir", e) Array.empty } } private[spark] def getSuffixForCredentialsPath(credentialsPath: Path): Int = { val fileName = credentialsPath.getName fileName.substring( fileName.lastIndexOf(SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM) + 1).toInt } private val HADOOP_CONF_PATTERN = "(\\$\\{hadoopconf-[^\\}\\$\\s]+\\})".r.unanchored /** * Substitute variables by looking them up in Hadoop configs. Only variables that match the * ${hadoopconf- .. } pattern are substituted. */ def substituteHadoopVariables(text: String, hadoopConf: Configuration): String = { text match { case HADOOP_CONF_PATTERN(matched) => logDebug(text + " matched " + HADOOP_CONF_PATTERN) val key = matched.substring(13, matched.length() - 1) // remove ${hadoopconf- .. } val eval = Option[String](hadoopConf.get(key)) .map { value => logDebug("Substituted " + matched + " with " + value) text.replace(matched, value) } if (eval.isEmpty) { // The variable was not found in Hadoop configs, so return text as is. text } else { // Continue to substitute more variables. 
substituteHadoopVariables(eval.get, hadoopConf) } case _ => logDebug(text + " didn't match " + HADOOP_CONF_PATTERN) text } } /** * Dump the credentials' tokens to string values. * * @param credentials credentials * @return an iterator over the string values. If no credentials are passed in: an empty list */ private[spark] def dumpTokens(credentials: Credentials): Iterable[String] = { if (credentials != null) { credentials.getAllTokens.asScala.map(tokenToString) } else { Seq.empty } } /** * Convert a token to a string for logging. * If its an abstract delegation token, attempt to unmarshall it and then * print more details, including timestamps in human-readable form. * * @param token token to convert to a string * @return a printable string value. */ private[spark] def tokenToString(token: Token[_ <: TokenIdentifier]): String = { val df = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.SHORT, Locale.US) val buffer = new StringBuilder(128) buffer.append(token.toString) try { val ti = token.decodeIdentifier buffer.append("; ").append(ti) ti match { case dt: AbstractDelegationTokenIdentifier => // include human times and the renewer, which the HDFS tokens toString omits buffer.append("; Renewer: ").append(dt.getRenewer) buffer.append("; Issued: ").append(df.format(new Date(dt.getIssueDate))) buffer.append("; Max Date: ").append(df.format(new Date(dt.getMaxDate))) case _ => } } catch { case e: IOException => logDebug(s"Failed to decode $token: $e", e) } buffer.toString } def serialize(creds: Credentials): Array[Byte] = { val byteStream = new ByteArrayOutputStream val dataStream = new DataOutputStream(byteStream) creds.writeTokenStorageToStream(dataStream) byteStream.toByteArray } def deserialize(tokenBytes: Array[Byte]): Credentials = { val tokensBuf = new ByteArrayInputStream(tokenBytes) val creds = new Credentials() creds.readTokenStorageStream(new DataInputStream(tokensBuf)) creds } def isProxyUser(ugi: UserGroupInformation): Boolean = { ugi.getAuthenticationMethod() == UserGroupInformation.AuthenticationMethod.PROXY } } object SparkHadoopUtil extends Logging { private lazy val instance = new SparkHadoopUtil val SPARK_YARN_CREDS_TEMP_EXTENSION = ".tmp" val SPARK_YARN_CREDS_COUNTER_DELIM = "-" /** * Number of records to update input metrics when reading from HadoopRDDs. * * Each update is potentially expensive because we need to use reflection to access the * Hadoop FileSystem API of interest (only available in 2.5), so we should do this sparingly. */ private[spark] val UPDATE_INPUT_METRICS_INTERVAL_RECORDS = 1000 /** * Name of the file containing the gateway's Hadoop configuration, to be overlayed on top of the * cluster's Hadoop config. It is up to the Spark code launching the application to create * this file if it's desired. If the file doesn't exist, it will just be ignored. */ private[spark] val SPARK_HADOOP_CONF_FILE = "__spark_hadoop_conf__.xml" def get: SparkHadoopUtil = instance /** * Returns a Configuration object with Spark configuration applied on top. Unlike * the instance method, this will always return a Configuration instance, and not a * cluster manager-specific type. 
*/ private[spark] def newConfiguration(conf: SparkConf): Configuration = { val hadoopConf = new Configuration() appendS3AndSparkHadoopHiveConfigurations(conf, hadoopConf) hadoopConf } private def appendS3AndSparkHadoopHiveConfigurations( conf: SparkConf, hadoopConf: Configuration): Unit = { // Note: this null check is around more than just access to the "conf" object to maintain // the behavior of the old implementation of this code, for backwards compatibility. if (conf != null) { // Explicitly check for S3 environment variables val keyId = System.getenv("AWS_ACCESS_KEY_ID") val accessKey = System.getenv("AWS_SECRET_ACCESS_KEY") if (keyId != null && accessKey != null) { hadoopConf.set("fs.s3.awsAccessKeyId", keyId) hadoopConf.set("fs.s3n.awsAccessKeyId", keyId) hadoopConf.set("fs.s3a.access.key", keyId) hadoopConf.set("fs.s3.awsSecretAccessKey", accessKey) hadoopConf.set("fs.s3n.awsSecretAccessKey", accessKey) hadoopConf.set("fs.s3a.secret.key", accessKey) val sessionToken = System.getenv("AWS_SESSION_TOKEN") if (sessionToken != null) { hadoopConf.set("fs.s3a.session.token", sessionToken) } } appendHiveConfigs(hadoopConf) appendSparkHadoopConfigs(conf, hadoopConf) appendSparkHiveConfigs(conf, hadoopConf) val bufferSize = conf.get(BUFFER_SIZE).toString hadoopConf.set("io.file.buffer.size", bufferSize) } } private lazy val hiveConfKeys = { val configFile = Utils.getContextOrSparkClassLoader.getResource("hive-site.xml") if (configFile != null) { val conf = new Configuration(false) conf.addResource(configFile) conf.iterator().asScala.toSeq } else { Nil } } private def appendHiveConfigs(hadoopConf: Configuration): Unit = { hiveConfKeys.foreach { kv => hadoopConf.set(kv.getKey, kv.getValue) } } private def appendSparkHadoopConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { // Copy any "spark.hadoop.foo=bar" spark properties into conf as "foo=bar" for ((key, value) <- conf.getAll if key.startsWith("spark.hadoop.")) { hadoopConf.set(key.substring("spark.hadoop.".length), value) } if (conf.getOption("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version").isEmpty) { hadoopConf.set("mapreduce.fileoutputcommitter.algorithm.version", "1") } } private def appendSparkHiveConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { // Copy any "spark.hive.foo=bar" spark properties into conf as "hive.foo=bar" for ((key, value) <- conf.getAll if key.startsWith("spark.hive.")) { hadoopConf.set(key.substring("spark.".length), value) } } // scalastyle:off line.size.limit /** * Create a file on the given file system, optionally making sure erasure coding is disabled. * * Disabling EC can be helpful as HDFS EC doesn't support hflush(), hsync(), or append(). * Disabling EC can be helpful as HDFS EC doesn't support hflush(), hsync(), or append(). * https://hadoop.apache.org/docs/r3.0.0/hadoop-project-dist/hadoop-hdfs/HDFSErasureCoding.html#Limitations */ // scalastyle:on line.size.limit def createFile(fs: FileSystem, path: Path, allowEC: Boolean): FSDataOutputStream = { if (allowEC) { fs.create(path) } else { try { // Use reflection as this uses APIs only available in Hadoop 3 val builderMethod = fs.getClass().getMethod("createFile", classOf[Path]) // the builder api does not resolve relative paths, nor does it create parent dirs, while // the old api does. 
if (!fs.mkdirs(path.getParent())) { throw new IOException(s"Failed to create parents of $path") } val qualifiedPath = fs.makeQualified(path) val builder = builderMethod.invoke(fs, qualifiedPath) val builderCls = builder.getClass() // this may throw a NoSuchMethodException if the path is not on hdfs val replicateMethod = builderCls.getMethod("replicate") val buildMethod = builderCls.getMethod("build") val b2 = replicateMethod.invoke(builder) buildMethod.invoke(b2).asInstanceOf[FSDataOutputStream] } catch { case _: NoSuchMethodException => // No createFile() method, we're using an older hdfs client, which doesn't give us control // over EC vs. replication. Older hdfs doesn't have EC anyway, so just create a file with // old apis. fs.create(path) } } } }
================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/pom.xml ================================================
<project>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>fire-connector-spark-rocketmq_${spark.reference}</artifactId>
    <packaging>jar</packaging>
    <name>Fire : Connectors : Spark : RocketMQ</name>

    <parent>
        <groupId>com.zto.fire</groupId>
        <artifactId>fire-spark-connectors</artifactId>
        <version>2.3.2-SNAPSHOT</version>
        <relativePath>../pom.xml</relativePath>
    </parent>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <scope>${maven.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <scope>${maven.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <scope>${maven.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.rocketmq</groupId>
            <artifactId>rocketmq-client</artifactId>
            <version>${rocketmq.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.rocketmq</groupId>
            <artifactId>rocketmq-common</artifactId>
            <version>${rocketmq.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>io.netty</groupId>
                    <artifactId>netty-tcnative</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>commons-lang</groupId>
            <artifactId>commons-lang</artifactId>
            <version>${commons-lang.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
        <resources>
            <resource>
                <directory>src/main/resources</directory>
                <filtering>true</filtering>
            </resource>
        </resources>
    </build>
</project>
================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/java/org/apache/rocketmq/spark/OffsetCommitCallback.java ================================================
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.spark; import org.apache.rocketmq.common.message.MessageQueue; import java.util.Map; /** * A callback interface that the user can implement to trigger custom actions when a commit request completes. */ public interface OffsetCommitCallback { /** * A callback method the user can implement to provide asynchronous handling of commit request completion. * This method will be called by InputDstream when the last batch is handled successfully.
* @param offsets the offsets which already are handled successfully * @param exception The exception thrown during processing of the request, or null if the commit completed successfully */ void onComplete(Map offsets, Exception exception); } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/java/org/apache/rocketmq/spark/RocketMQConfig.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.spark; import org.apache.commons.lang.Validate; import org.apache.rocketmq.client.ClientConfig; import org.apache.rocketmq.client.consumer.DefaultMQPushConsumer; import org.apache.rocketmq.client.exception.MQClientException; import org.apache.rocketmq.common.consumer.ConsumeFromWhere; import org.apache.rocketmq.remoting.common.RemotingUtil; import java.util.Properties; import java.util.UUID; /** * RocketMQConfig for Consumer */ public class RocketMQConfig { // ------- the following is for common usage ------- /** * RocketMq name server address */ public static final String NAME_SERVER_ADDR = "nameserver.addr"; // Required public static final String CLIENT_NAME = "client.name"; public static final String CLIENT_IP = "client.ip"; public static final String DEFAULT_CLIENT_IP = RemotingUtil.getLocalAddress(); public static final String CLIENT_CALLBACK_EXECUTOR_THREADS = "client.callback.executor.threads"; public static final int DEFAULT_CLIENT_CALLBACK_EXECUTOR_THREADS = Runtime.getRuntime().availableProcessors();; public static final String NAME_SERVER_POLL_INTERVAL = "nameserver.poll.interval"; public static final int DEFAULT_NAME_SERVER_POLL_INTERVAL = 30000; // 30 seconds public static final String BROKER_HEART_BEAT_INTERVAL = "brokerserver.heartbeat.interval"; public static final int DEFAULT_BROKER_HEART_BEAT_INTERVAL = 30000; // 30 seconds // ------- the following is for push consumer mode ------- /** * RocketMq consumer group */ public static final String CONSUMER_GROUP = "consumer.group"; // Required /** * RocketMq consumer topic */ public static final String CONSUMER_TOPIC = "consumer.topic"; // Required public static final String CONSUMER_TAG = "consumer.tag"; public static final String DEFAULT_TAG = "*"; public static final String CONSUMER_OFFSET_RESET_TO = "consumer.offset.reset.to"; public static final String CONSUMER_OFFSET_LATEST = "latest"; public static final String CONSUMER_OFFSET_EARLIEST = "earliest"; public static final String CONSUMER_OFFSET_TIMESTAMP = "timestamp"; public static final String CONSUMER_MESSAGES_ORDERLY = "consumer.messages.orderly"; public static final String CONSUMER_OFFSET_PERSIST_INTERVAL = "consumer.offset.persist.interval"; public static final int DEFAULT_CONSUMER_OFFSET_PERSIST_INTERVAL = 5000; // 5 seconds 
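    // A minimal sketch of how the string keys declared above are meant to be used: callers collect
    // them in a java.util.Properties object and hand it to buildConsumerConfigs(...) further down in
    // this class, which validates the required group/topic, applies the offset-reset policy and
    // subscribes the consumer. The nameserver address, group and topic values below are placeholders,
    // not values taken from this repository:
    //
    //   Properties props = new Properties();
    //   props.setProperty(RocketMQConfig.NAME_SERVER_ADDR, "127.0.0.1:9876");   // required
    //   props.setProperty(RocketMQConfig.CONSUMER_GROUP, "example_group");      // required
    //   props.setProperty(RocketMQConfig.CONSUMER_TOPIC, "example_topic");      // required
    //   props.setProperty(RocketMQConfig.CONSUMER_OFFSET_RESET_TO, RocketMQConfig.CONSUMER_OFFSET_EARLIEST);
    //   DefaultMQPushConsumer consumer = new DefaultMQPushConsumer();
    //   RocketMQConfig.buildConsumerConfigs(props, consumer);
    //   consumer.start();                                                       // may throw MQClientException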
public static final String CONSUMER_MIN_THREADS = "consumer.min.threads"; public static final int DEFAULT_CONSUMER_MIN_THREADS = 20; public static final String CONSUMER_MAX_THREADS = "consumer.max.threads"; public static final int DEFAULT_CONSUMER_MAX_THREADS = 64; // ------- the following is for reliable Receiver ------- public static final String QUEUE_SIZE = "spout.queue.size"; public static final int DEFAULT_QUEUE_SIZE = 500; public static final String MESSAGES_MAX_RETRY = "spout.messages.max.retry"; public static final int DEFAULT_MESSAGES_MAX_RETRY = 3; public static final String MESSAGES_TTL = "spout.messages.ttl"; public static final int DEFAULT_MESSAGES_TTL = 300000; // 5min // ------- the following is for pull consumer mode ------- /** * Maximum rate (number of records per second) at which data will be read from each RocketMq partition , * and the default value is "-1", it means consumer can pull message from rocketmq as fast as the consumer can. * Other that, you also enables or disables Spark Streaming's internal backpressure mechanism by the config * "spark.streaming.backpressure.enabled". */ public static final String MAX_PULL_SPEED_PER_PARTITION = "pull.max.speed.per.partition"; /** * To pick up the consume speed, the consumer can pull a batch of messages at a time. And the default * value is "32" */ public static final String PULL_MAX_BATCH_SIZE = "pull.max.batch.size"; /** * pull timeout for the consumer, and the default time is "3000". */ public static final String PULL_TIMEOUT_MS = "pull.timeout.ms"; // the following configs for consumer cache public static final String PULL_CONSUMER_CACHE_INIT_CAPACITY = "pull.consumer.cache.initialCapacity"; public static final String PULL_CONSUMER_CACHE_MAX_CAPACITY = "pull.consumer.cache.maxCapacity"; public static final String PULL_CONSUMER_CACHE_LOAD_FACTOR = "pull.consumer.cache.loadFactor"; public static void buildConsumerConfigs(Properties props, DefaultMQPushConsumer consumer) { buildCommonConfigs(props, consumer); String group = props.getProperty(CONSUMER_GROUP); Validate.notEmpty(group); consumer.setConsumerGroup(group); consumer.setPersistConsumerOffsetInterval(getInteger(props, CONSUMER_OFFSET_PERSIST_INTERVAL, DEFAULT_CONSUMER_OFFSET_PERSIST_INTERVAL)); consumer.setConsumeThreadMin(getInteger(props, CONSUMER_MIN_THREADS, DEFAULT_CONSUMER_MIN_THREADS)); consumer.setConsumeThreadMax(getInteger(props, CONSUMER_MAX_THREADS, DEFAULT_CONSUMER_MAX_THREADS)); String initOffset = props.getProperty(CONSUMER_OFFSET_RESET_TO, CONSUMER_OFFSET_LATEST); switch (initOffset) { case CONSUMER_OFFSET_EARLIEST: consumer.setConsumeFromWhere(ConsumeFromWhere.CONSUME_FROM_FIRST_OFFSET); break; case CONSUMER_OFFSET_LATEST: consumer.setConsumeFromWhere(ConsumeFromWhere.CONSUME_FROM_LAST_OFFSET); break; case CONSUMER_OFFSET_TIMESTAMP: consumer.setConsumeTimestamp(initOffset); break; default: consumer.setConsumeFromWhere(ConsumeFromWhere.CONSUME_FROM_LAST_OFFSET); } String topic = props.getProperty(CONSUMER_TOPIC); Validate.notEmpty(topic); try { consumer.subscribe(topic, props.getProperty(CONSUMER_TAG, DEFAULT_TAG)); } catch (MQClientException e) { throw new IllegalArgumentException(e); } } public static void buildCommonConfigs(Properties props, ClientConfig client) { String namesvr = props.getProperty(NAME_SERVER_ADDR); Validate.notEmpty(namesvr); client.setNamesrvAddr(namesvr); client.setClientIP(props.getProperty(CLIENT_IP, DEFAULT_CLIENT_IP)); // use UUID for client name by default String defaultClientName = UUID.randomUUID().toString(); 
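        // The remaining assignments copy the optional client-level settings onto the shared
        // ClientConfig, falling back to the defaults declared above when a key is absent:
        // instance name (random UUID unless CLIENT_NAME is set), callback executor threads,
        // nameserver poll interval and broker heartbeat interval.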
client.setInstanceName(props.getProperty(CLIENT_NAME, defaultClientName)); client.setClientCallbackExecutorThreads(getInteger(props, CLIENT_CALLBACK_EXECUTOR_THREADS, DEFAULT_CLIENT_CALLBACK_EXECUTOR_THREADS)); client.setPollNameServerInterval(getInteger(props, NAME_SERVER_POLL_INTERVAL, DEFAULT_NAME_SERVER_POLL_INTERVAL)); client.setHeartbeatBrokerInterval(getInteger(props, BROKER_HEART_BEAT_INTERVAL, DEFAULT_BROKER_HEART_BEAT_INTERVAL)); } public static int getInteger(Properties props, String key, int defaultValue) { return Integer.parseInt(props.getProperty(key, String.valueOf(defaultValue))); } public static boolean getBoolean(Properties props, String key, boolean defaultValue) { return Boolean.parseBoolean(props.getProperty(key, String.valueOf(defaultValue))); } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/java/org/apache/rocketmq/spark/TopicQueueId.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.spark; import java.io.Serializable; public final class TopicQueueId implements Serializable { private int hash = 0; private final int queueId; private final String topic; public TopicQueueId(String topic, int queueId) { this.queueId = queueId; this.topic = topic; } public int queueId() { return queueId; } public String topic() { return topic; } @Override public int hashCode() { if (hash != 0) { return hash; } final int prime = 31; int result = 1; result = prime * result + queueId; result = prime * result + ((topic == null) ? 0 : topic.hashCode()); this.hash = result; return result; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } TopicQueueId other = (TopicQueueId) obj; if (queueId != other.queueId) { return false; } if (topic == null) { if (other.topic != null) { return false; } } else { if (!topic.equals(other.topic)) { return false; } } return true; } @Override protected TopicQueueId clone() throws CloneNotSupportedException { return new TopicQueueId(this.topic, this.queueId); } @Override public String toString() { return topic + "-" + queueId; } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/java/org/apache/rocketmq/spark/streaming/DefaultMessageRetryManager.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. 
The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.spark.streaming; import java.util.Map; import java.util.Timer; import java.util.TimerTask; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; /** * An implementation of MessageRetryManager */ public class DefaultMessageRetryManager implements MessageRetryManager{ private Map cache = new ConcurrentHashMap<>(500); private BlockingQueue queue; private int maxRetry; private int ttl; public DefaultMessageRetryManager(BlockingQueue queue, final int maxRetry, final int ttl) { this.queue = queue; this.maxRetry = maxRetry; this.ttl = ttl; long period = 5000; new Timer().scheduleAtFixedRate(new TimerTask() { @Override public void run() { long now = System.currentTimeMillis(); for (Map.Entry entry : cache.entrySet()) { String id = entry.getKey(); MessageSet messageSet = entry.getValue(); if (now - messageSet.getTimestamp() >= ttl) { // no ack/fail received in ttl fail(id); } } } }, period, period); } @Override public void ack(String id) { cache.remove(id); } @Override public void fail(String id) { MessageSet messageSet = cache.remove(id); if (messageSet == null) { return; } if (needRetry(messageSet)) { messageSet.setRetries(messageSet.getRetries() + 1); messageSet.setTimestamp(0); try { queue.put(messageSet); } catch (InterruptedException e) { // no op } } } @Override public void mark(MessageSet messageSet) { messageSet.setTimestamp(System.currentTimeMillis()); cache.put(messageSet.getId(), messageSet); } @Override public boolean needRetry(MessageSet messageSet) { return messageSet.getRetries() < maxRetry; } // just for testing public void setCache(Map cache) { this.cache = cache; } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/java/org/apache/rocketmq/spark/streaming/MessageRetryManager.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.rocketmq.spark.streaming; /** * Interface for messages retry manager */ public interface MessageRetryManager { /** * message with the id is success * @param id */ void ack(String id); /** * message with the id is failure * @param id */ void fail(String id); /** * Mark the messageSet * @param messageSet */ void mark(MessageSet messageSet); /** * Is the messageSet need retry * @param messageSet * @return */ boolean needRetry(MessageSet messageSet); } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/java/org/apache/rocketmq/spark/streaming/MessageSet.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.spark.streaming; import org.apache.rocketmq.common.message.Message; import org.apache.rocketmq.common.message.MessageExt; import java.io.Serializable; import java.util.Iterator; import java.util.List; import java.util.UUID; /** * A message collection. */ public class MessageSet implements Iterator, Serializable{ private final String id; private final List data; private final Iterator iterator; private long timestamp; private int retries; public MessageSet(String id, List data) { this.id = id; this.data = data; this.iterator = data.iterator(); } public MessageSet(List data) { this(UUID.randomUUID().toString(), data); } public String getId() { return id; } public List getData() { return data; } public long getTimestamp() { return timestamp; } public void setTimestamp(long timestamp) { this.timestamp = timestamp; } public int getRetries() { return retries; } public void setRetries(int retries) { this.retries = retries; } @Override public boolean hasNext() { return iterator.hasNext(); } @Override public Message next() { return iterator.next(); } @Override public void remove() { iterator.remove(); } @Override public String toString() { return data.toString(); } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/java/org/apache/rocketmq/spark/streaming/ReliableRocketMQReceiver.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.spark.streaming; import org.apache.rocketmq.common.message.MessageExt; import org.apache.rocketmq.spark.RocketMQConfig; import org.apache.spark.storage.StorageLevel; import java.util.List; import java.util.Properties; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; /** * The ReliableRocketMQReceiver is fault-tolerance guarantees */ public class ReliableRocketMQReceiver extends RocketMQReceiver { private BlockingQueue queue; private MessageRetryManager messageRetryManager; private MessageSender sender; public ReliableRocketMQReceiver(Properties properties, StorageLevel storageLevel) { super(properties, storageLevel); } @Override public void onStart() { int queueSize = RocketMQConfig.getInteger(properties, RocketMQConfig.QUEUE_SIZE, RocketMQConfig.DEFAULT_QUEUE_SIZE); queue = new LinkedBlockingQueue<>(queueSize); int maxRetry = RocketMQConfig.getInteger(properties, RocketMQConfig.MESSAGES_MAX_RETRY, RocketMQConfig.DEFAULT_MESSAGES_MAX_RETRY); int ttl = RocketMQConfig.getInteger(properties, RocketMQConfig.MESSAGES_TTL, RocketMQConfig.DEFAULT_MESSAGES_TTL); this.messageRetryManager = new DefaultMessageRetryManager(queue, maxRetry, ttl); this.sender = new MessageSender(); this.sender.setName("MessageSender"); this.sender.setDaemon(true); this.sender.start(); super.onStart(); } @Override public boolean process(List msgs) { if (msgs.isEmpty()) { return true; } MessageSet messageSet = new MessageSet(msgs); try { queue.put(messageSet); return true; } catch (InterruptedException e) { return false; } } public void ack(Object msgId) { String id = msgId.toString(); messageRetryManager.ack(id); } public void fail(Object msgId) { String id = msgId.toString(); messageRetryManager.fail(id); } @Override public void onStop() { consumer.shutdown(); } class MessageSender extends Thread { @Override public void run() { while (ReliableRocketMQReceiver.this.isStarted()) { MessageSet messageSet = null; try { messageSet = queue.take(); } catch (InterruptedException e) { continue; } if (messageSet == null) { continue; } messageRetryManager.mark(messageSet); try { // To implement a reliable receiver, you have to use store(multiple-records) to store data ReliableRocketMQReceiver.this.store(messageSet); ack(messageSet.getId()); } catch (Exception e) { fail(messageSet.getId()); } } } } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/java/org/apache/rocketmq/spark/streaming/RocketMQReceiver.java ================================================ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.spark.streaming; import org.apache.commons.lang.Validate; import org.apache.rocketmq.client.consumer.DefaultMQPushConsumer; import org.apache.rocketmq.client.consumer.MQPushConsumer; import org.apache.rocketmq.client.consumer.listener.*; import org.apache.rocketmq.client.exception.MQClientException; import org.apache.rocketmq.common.message.Message; import org.apache.rocketmq.common.message.MessageExt; import org.apache.rocketmq.spark.RocketMQConfig; import org.apache.spark.storage.StorageLevel; import org.apache.spark.streaming.receiver.Receiver; import java.util.List; import java.util.Properties; /** * RocketMQReceiver uses MQPushConsumer as the default implementation. * PushConsumer is a high level consumer API, wrapping the pulling details * Looks like broker push messages to consumer * * NOTE: This is no fault-tolerance guarantees, can lose data on receiver failure. * Recommend to use ReliableRocketMQReceiver which is fault-tolerance guarantees. */ public class RocketMQReceiver extends Receiver { protected MQPushConsumer consumer; protected boolean ordered; protected Properties properties; public RocketMQReceiver(Properties properties, StorageLevel storageLevel) { super(storageLevel); this.properties = properties; } @Override public void onStart() { Validate.notEmpty(properties, "Consumer properties can not be empty"); ordered = RocketMQConfig.getBoolean(properties, RocketMQConfig.CONSUMER_MESSAGES_ORDERLY, false); consumer = new DefaultMQPushConsumer(); RocketMQConfig.buildConsumerConfigs(properties, (DefaultMQPushConsumer)consumer); if (ordered) { consumer.registerMessageListener(new MessageListenerOrderly() { @Override public ConsumeOrderlyStatus consumeMessage(List msgs, ConsumeOrderlyContext context) { if (process(msgs)) { return ConsumeOrderlyStatus.SUCCESS; } else { return ConsumeOrderlyStatus.SUSPEND_CURRENT_QUEUE_A_MOMENT; } } }); } else { consumer.registerMessageListener(new MessageListenerConcurrently() { @Override public ConsumeConcurrentlyStatus consumeMessage(List msgs, ConsumeConcurrentlyContext context) { if (process(msgs)) { return ConsumeConcurrentlyStatus.CONSUME_SUCCESS; } else { return ConsumeConcurrentlyStatus.RECONSUME_LATER; } } }); } try { consumer.start(); } catch (MQClientException e) { throw new RuntimeException(e); } } public boolean process(List msgs) { if (msgs.isEmpty()) { return true; } try { for (MessageExt msg : msgs) { this.store(msg); } return true; } catch (Exception e) { return false; } } @Override public void onStop() { consumer.shutdown(); } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/rocketmq/spark/CachedMQConsumer.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.spark import org.apache.rocketmq.client.consumer.{DefaultMQPullConsumer, PullStatus} import org.apache.rocketmq.common.message.{MessageExt, MessageQueue} import java.{util => ju} /** * Consumer of single topic partition, intended for cached reuse. */ private[rocketmq] class CachedMQConsumer private( val groupId: String, val client: DefaultMQPullConsumer, val topic: String, val queueId: Int, val names: Set[String], val optionParams: ju.Map[String, String]) extends Logging { private val maxBatchSize = optionParams.getOrDefault(RocketMQConfig.PULL_MAX_BATCH_SIZE, "32").toInt private var buffer = names.map(name => name -> ju.Collections.emptyList[MessageExt].iterator).toMap private var nextOffsets = names.map(name => name -> -2L).toMap /** * Get the record for the given offset, waiting up to timeout ms if IO is necessary. * Sequential forward access will use buffers, but random access will be horribly inefficient. */ def get(name: String, queueOffset: Long): MessageExt = { val nextOffset = nextOffsets(name) logDebug(s"Get $groupId $topic $queueId brokerName $name nextOffset $nextOffset requested") if (queueOffset != nextOffset) { logInfo(s"Initial fetch for $groupId $topic $name $queueOffset") poll(name, queueOffset) } if (!buffer(name).hasNext) { poll(name, queueOffset) } val iter = buffer(name) if(iter.hasNext) { val record = iter.next assert(record.getQueueOffset == queueOffset, s"Got wrong record for $groupId $topic $queueId $name even after seeking to offset $queueOffset") nextOffsets += (name -> (queueOffset + 1)) record } else { throw new IllegalStateException(s"Failed to get records for $groupId $topic $queueId $name $queueOffset after polling ") } } private def poll(name: String, queueOffset: Long) { var p = client.pull(new MessageQueue(topic, name, queueId), "*", queueOffset, maxBatchSize) var i = 0 while (p.getPullStatus != PullStatus.FOUND){ // it maybe not get the message, so we will retry Thread.sleep(100) logError(s"Polled failed for $queueId $name $queueOffset $maxBatchSize ${p.toString}") i = i + 1 p = client.pull(new MessageQueue(topic, name, queueId), "*", queueOffset, maxBatchSize) if (i > 10){ throw new IllegalStateException(s"Failed to get records for $groupId $topic $queueId $name $queueOffset after polling," + s"due to ${p.toString}") } } buffer += (name -> p.getMsgFoundList.iterator) } } object CachedMQConsumer extends Logging { private case class CacheKey(groupId: String, topic: String, queueId: Int, names: Set[String]) private var groupIdToClient = Map[String, DefaultMQPullConsumer]() // Don't want to depend on guava, don't want a cleanup thread, use a simple LinkedHashMap private var cache: ju.LinkedHashMap[CacheKey, CachedMQConsumer] = null /** Must be called before get, once per JVM, to configure the cache. 
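(An illustrative call with assumed cache sizing values: `CachedMQConsumer.init(16, 64, 0.75f)`.)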
Further calls are ignored */ def init( initialCapacity: Int, maxCapacity: Int, loadFactor: Float): Unit = CachedMQConsumer.synchronized { if (null == cache) { logInfo(s"Initializing cache $initialCapacity $maxCapacity $loadFactor") cache = new ju.LinkedHashMap[CacheKey, CachedMQConsumer]( initialCapacity, loadFactor, true) { override def removeEldestEntry( entry: ju.Map.Entry[CacheKey, CachedMQConsumer]): Boolean = { if (this.size > maxCapacity) { true } else { false } } } } } /** * Get a cached consumer for groupId, assigned to topic, queueId and names. * If a matching consumer doesn't already exist, it will be created using optionParams. */ def getOrCreate( groupId: String, topic: String, queueId: Int, names: Set[String], optionParams: ju.Map[String, String]): CachedMQConsumer = CachedMQConsumer.synchronized { val client = if (!groupIdToClient.contains(groupId)){ val client = RocketMqUtils.mkPullConsumerInstance(groupId, optionParams, s"$groupId-executor") groupIdToClient += groupId -> client client } else { groupIdToClient(groupId) } val k = CacheKey(groupId, topic, queueId, names) if (cache.containsKey(k)) { cache.get(k) } else { logInfo(s"Cache miss for $k") logDebug(cache.keySet.toString) val c = new CachedMQConsumer(groupId, client, topic, queueId, names, optionParams) cache.put(k, c) c } } /** * Get a fresh new instance, unassociated with the global cache. * Caller is responsible for closing it. */ def getUncached( groupId: String, topic: String, queueId: Int, names: Set[String], optionParams: ju.Map[String, String]): CachedMQConsumer = { val client = RocketMqUtils.mkPullConsumerInstance(groupId, optionParams, s"$groupId-executor-$queueId-${names.mkString("-")}") new CachedMQConsumer(groupId, client, topic, queueId, names, optionParams) } /** remove consumer for given groupId, topic, and queueId, if it exists */ def remove(groupId: String, topic: String, queueId: Int, names: Set[String]): Unit = { val k = CacheKey(groupId, topic, queueId, names) logInfo(s"Removing $k from cache") val v = CachedMQConsumer.synchronized { cache.remove(k) } } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/rocketmq/spark/ConsumerStrategy.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.spark import java.{util => ju} import org.apache.rocketmq.common.UtilAll import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.streaming.MQPullInputDStream import scala.collection.JavaConverters._ /** * Specify the start available offset for the rocketmq consumer */ sealed abstract class ConsumerStrategy /** * Specify the earliest available offset for the rocketmq consumer to start consuming. 
* But if the rocketmq server has checkpoint for the [[MessageQueue]], then the consumer will consume from * the checkpoint. */ case object EarliestStrategy extends ConsumerStrategy /** * Specify the lastest available offset for the rocketmq consumer to start consuming. * But if the rocketmq server has checkpoint for the [[MessageQueue]], then the consumer will consume from * the checkpoint. */ case object LatestStrategy extends ConsumerStrategy /** * Specify the specific available offset for the rocketmq consumer to start consuming. * Generally if the rocketmq server has checkpoint for the [[MessageQueue]], then the consumer will consume from * the checkpoint. But if the [[MQPullInputDStream.forceSpecial]] is true, the rocketmq will start consuming from * the specific available offset in any case. Of course, the consumer will use the min available offset if a message * queue is not specified. */ case class SpecificOffsetStrategy( queueToOffset: Map[MessageQueue, Long]) extends ConsumerStrategy object ConsumerStrategy { /** * Used to denote offset range limits that are resolved via rocketmq */ val LATEST = -1L // indicates resolution to the latest offset val EARLIEST = -2L // indicates resolution to the earliest offset def earliest: ConsumerStrategy = org.apache.rocketmq.spark.EarliestStrategy def lastest: ConsumerStrategy = org.apache.rocketmq.spark.LatestStrategy def specificOffset(queueToOffset: ju.Map[MessageQueue, Long]): ConsumerStrategy = { val scalaMapOffset = queueToOffset.asScala.map{ case (q, o) => (q, o) }.toMap SpecificOffsetStrategy(scalaMapOffset) } def specificTime(queueToTime: ju.Map[MessageQueue, String]): ConsumerStrategy = { val queueToOffset = queueToTime.asScala.map{ case (q, t) => val offset = UtilAll.parseDate(t, UtilAll.YYYY_MM_DD_HH_MM_SS).getTime (q, offset) }.toMap SpecificOffsetStrategy(queueToOffset) } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/rocketmq/spark/LocationStrategy.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.spark import java.{util => ju} import scala.collection.JavaConverters._ /** * :: Experimental :: * Choice of how to schedule consumers for a given [[TopicQueueId]] on an executor. * See [[LocationStrategy]] to obtain instances. * RocketMq consumers prefetch messages, so it's important for performance * to keep cached consumers on appropriate executors, not recreate them for every partition. * Choice of location is only a preference, not an absolute; partitions may be scheduled elsewhere. 
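 *
 * A minimal usage sketch (illustrative only, not taken from this module):
 * {{{
 *   // In most cases simply distribute partitions consistently across executors:
 *   val strategy: LocationStrategy = LocationStrategy.PreferConsistent
 * }}}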
*/ sealed abstract class LocationStrategy case object PreferConsistent extends LocationStrategy case class PreferFixed(hostMap: ju.Map[TopicQueueId, String]) extends LocationStrategy /** * object to obtain instances of [[LocationStrategy]] * */ object LocationStrategy { /** * * Use this in most cases, it will consistently distribute partitions across all executors. */ def PreferConsistent: LocationStrategy = org.apache.rocketmq.spark.PreferConsistent /** * Use this to place particular TopicQueueIds on particular hosts if your load is uneven. * Any TopicQueueId not specified in the map will use a consistent location. */ def PreferFixed(hostMap: collection.Map[TopicQueueId, String]): LocationStrategy = new PreferFixed(new ju.HashMap[TopicQueueId, String](hostMap.asJava)) /** * Use this to place particular TopicQueueIds on particular hosts if your load is uneven. * Any TopicQueueId not specified in the map will use a consistent location. */ def PreferFixed(hostMap: ju.Map[TopicQueueId, String]): LocationStrategy = new PreferFixed(hostMap) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/rocketmq/spark/Logging.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.spark import org.slf4j.{Logger, LoggerFactory} /** * Utility trait for classes that want to log data. 
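 *
 * A minimal usage sketch (the class name below is an illustrative assumption):
 * {{{
 *   class OffsetReporter extends Logging {
 *     def report(): Unit = logInfo("offsets committed")   // emitted only when INFO is enabled
 *   }
 * }}}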
*/ trait Logging { // Make the log field transient so that objects with Logging can // be serialized and used on another machine @transient private var log_ : Logger = null // Method to get or create the logger def log: Logger = { if (log_ == null) log_ = LoggerFactory.getLogger(this.getClass.getName.stripSuffix("$")) return log_ } // Log methods that take only a String def logInfo(msg: => String) = if (log.isInfoEnabled) log.info(msg) def logDebug(msg: => String) = if (log.isDebugEnabled) log.debug(msg) def logWarning(msg: => String) = if (log.isWarnEnabled) log.warn(msg) def logError(msg: => String) = if (log.isErrorEnabled) log.error(msg) // Log methods that take Throwable (Exceptions/Errors) too def logInfo(msg: => String, throwable: Throwable) { if (log.isInfoEnabled) log.info(msg, throwable) } def logDebug(msg: => String, throwable: Throwable) { if (log.isDebugEnabled) log.debug(msg, throwable) } def logWarning(msg: => String, throwable: Throwable) = if (log.isWarnEnabled) log.warn(msg, throwable) def logError(msg: => String, throwable: Throwable) = if (log.isErrorEnabled) log.error(msg, throwable) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/rocketmq/spark/OffsetRange.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.spark import java.{util => ju} import org.apache.rocketmq.common.message.MessageQueue trait HasOffsetRanges { def offsetRanges: ju.Map[TopicQueueId, Array[OffsetRange]] } trait CanCommitOffsets { /** * Queue up offset ranges for commit to rocketmq at a future time. Threadsafe. * This is only needed if you intend to store offsets in rocketmq, instead of your own store. * @param offsetRanges The maximum untilOffset for a given partition will be used at commit. */ def commitAsync(offsetRanges: ju.Map[TopicQueueId, Array[OffsetRange]]): Unit /** * Queue up offset ranges for commit to rocketmq at a future time. Threadsafe. * This is only needed if you intend to store offsets in rocketmq, instead of your own store. * @param offsetRanges The maximum untilOffset for a given partition will be used at commit. * @param callback Only the most recently provided callback will be used at commit. */ def commitAsync(offsetRanges: ju.Map[TopicQueueId, Array[OffsetRange]], callback: OffsetCommitCallback): Unit } /** * Represents a range of offsets from a single rocketmq messageQueue. Instances of this class * can be created with `OffsetRange.create()`. 
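 * For example (topic, broker name and offsets are illustrative assumptions):
 * {{{
 *   val range = OffsetRange.create("topicA", 0, "broker-a", 100L, 200L)
 *   range.count()   // 100 messages in [100, 200)
 * }}}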
* * @param topic topic name * @param queueId queueId id * @param brokerName the broker name * @param fromOffset Inclusive starting offset * @param untilOffset Exclusive ending offset */ final class OffsetRange private( val topic: String, val queueId: Int, val brokerName: String, val fromOffset: Long, val untilOffset: Long) extends Serializable { import OffsetRange.OffsetRangeTuple /** rocketmq topicMessageQueue object, for convenience */ def topicMessageQueue(): MessageQueue = new MessageQueue(topic, brokerName, queueId) /** Number of messages this OffsetRange refers to */ def count(): Long = { val ret = untilOffset - fromOffset assert(ret >= 0, s"OffsetRange happened errors form $topic $brokerName $fromOffset to $untilOffset") ret } override def equals(obj: Any): Boolean = obj match { case that: OffsetRange => this.topic == that.topic && this.queueId == that.queueId && this.brokerName == that.brokerName && this.fromOffset == that.fromOffset && this.untilOffset == that.untilOffset case _ => false } override def hashCode(): Int = { toTuple.hashCode() } override def toString(): String = { s"OffsetRange(topic: '$topic', queueId: $queueId, brokerName: $brokerName, range: [$fromOffset -> $untilOffset])" } /** this is to avoid ClassNotFoundException during checkpoint restore */ def toTuple: OffsetRangeTuple = (topic, queueId, brokerName, fromOffset, untilOffset) } /** * Companion object the provides methods to create instances of [[OffsetRange]]. */ object OffsetRange { def create(topic: String, queueId: Int, brokerName: String, fromOffset: Long, untilOffset: Long): OffsetRange = new OffsetRange(topic, queueId, brokerName, fromOffset, untilOffset) def create( topicMessageQueue: MessageQueue, fromOffset: Long, untilOffset: Long): OffsetRange = new OffsetRange(topicMessageQueue.getTopic, topicMessageQueue.getQueueId, topicMessageQueue.getBrokerName, fromOffset, untilOffset) def apply(topic: String, queueId: Int, brokerName: String, fromOffset: Long, untilOffset: Long): OffsetRange = new OffsetRange(topic, queueId, brokerName, fromOffset, untilOffset) def apply( topicMessageQueue: MessageQueue, fromOffset: Long, untilOffset: Long): OffsetRange = new OffsetRange(topicMessageQueue.getTopic, topicMessageQueue.getQueueId, topicMessageQueue.getBrokerName, fromOffset, untilOffset) /** this is to avoid ClassNotFoundException during checkpoint restore */ type OffsetRangeTuple = (String, Int, String, Long, Long) def apply(t: OffsetRangeTuple) = new OffsetRange(t._1, t._2, t._3, t._4, t._5) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/rocketmq/spark/RocketMqRDDPartition.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.rocketmq.spark import org.apache.spark.Partition /** * the Partition for RocketMqRDD * @param index the partition id for rdd * @param topic the rockermq topic * @param queueId the rocketmq queue id * @param partitionOffsetRanges Represents a range of offsets from a single partition * */ class RocketMqRDDPartition( val index: Int, val topic: String, val queueId: Int, val partitionOffsetRanges: Array[OffsetRange] ) extends Partition { /** Number of messages this partition refers to */ def count(): Long = { if (!partitionOffsetRanges.isEmpty) partitionOffsetRanges.map(_.count).sum else 0L } /** rocketmq TopicQueueId object, for convenience */ def topicQueueId(): TopicQueueId = new TopicQueueId(topic, queueId) def brokerNames(): Set[String] = { partitionOffsetRanges.map(_.brokerName).sorted.toSet } override def toString: String = { s"$index $topic $queueId ${partitionOffsetRanges.mkString(",")}" } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/rocketmq/spark/RocketMqUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.rocketmq.spark import java.util.Properties import java.{lang => jl, util => ju} import org.apache.commons.lang.StringUtils import org.apache.rocketmq.client.consumer.DefaultMQPullConsumer import org.apache.rocketmq.common.message.{Message, MessageExt, MessageQueue} import org.apache.rocketmq.spark.streaming.{ReliableRocketMQReceiver, RocketMQReceiver} import org.apache.spark.SparkContext import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.api.java.{JavaInputDStream, JavaStreamingContext} import org.apache.spark.streaming.dstream.InputDStream import org.apache.spark.streaming.{MQPullInputDStream, RocketMqRDD, StreamingContext} import org.slf4j.LoggerFactory object RocketMqUtils { private lazy val logger = LoggerFactory.getLogger(this.getClass) /** * Scala constructor for a batch-oriented interface for consuming from rocketmq. * Starting and ending offsets are specified in advance, * so that you can control exactly-once semantics. * @param sc SparkContext * @param groupId it is for rocketMq for identifying the consumer * @param offsetRanges offset ranges that define the RocketMq data belonging to this RDD * @param optionParams optional configs, see [[RocketMQConfig]] for more details. * @param locationStrategy map from TopicQueueId to preferred host for processing that partition. 
* In most cases, use [[LocationStrategy.PreferConsistent]] * @return RDD[MessageExt] */ def createRDD( sc: SparkContext, groupId: String, offsetRanges: ju.Map[TopicQueueId, Array[OffsetRange]], optionParams: ju.Map[String, String] = new ju.HashMap, locationStrategy: LocationStrategy = PreferConsistent ): RDD[MessageExt] = { val preferredHosts = locationStrategy match { case PreferConsistent => ju.Collections.emptyMap[TopicQueueId, String]() case PreferFixed(hostMap) => hostMap } new RocketMqRDD(sc, groupId, optionParams, offsetRanges, preferredHosts, false) } /** * Java constructor for a batch-oriented interface for consuming from rocketmq. * Starting and ending offsets are specified in advance, * so that you can control exactly-once semantics. * @param jsc SparkContext * @param groupId it is for rocketMq for identifying the consumer * @param offsetRanges offset ranges that define the RocketMq data belonging to this RDD * @param optionParams optional configs, see [[RocketMQConfig]] for more details. * @param locationStrategy map from TopicQueueId to preferred host for processing that partition. * In most cases, use [[LocationStrategy.PreferConsistent]] * @return JavaRDD[MessageExt] */ def createJavaRDD( jsc: JavaSparkContext, groupId: String, offsetRanges: ju.Map[TopicQueueId, Array[OffsetRange]], optionParams: ju.Map[String, String] = new ju.HashMap, locationStrategy: LocationStrategy = PreferConsistent ): JavaRDD[MessageExt] = { new JavaRDD(createRDD(jsc.sc, groupId, offsetRanges, optionParams, locationStrategy)) } /** * Scala constructor for a RocketMq DStream * @param groupId it is for rocketMq for identifying the consumer * @param topics the topics for the rocketmq * @param consumerStrategy consumerStrategy In most cases, pass in [[ConsumerStrategy.lastest]], * see [[ConsumerStrategy]] for more details * @param autoCommit whether commit the offset to the rocketmq server automatically or not. If the user * implement the [[OffsetCommitCallback]], the autoCommit must be set false * @param forceSpecial Generally if the rocketmq server has checkpoint for the [[MessageQueue]], then the consumer * will consume from the checkpoint no matter we specify the offset or not. But if forceSpecial is true, * the rocketmq will start consuming from the specific available offset in any case. * @param failOnDataLoss Zero data lost is not guaranteed when topics are deleted. If zero data lost is critical, * the user must make sure all messages in a topic have been processed when deleting a topic. * @param locationStrategy map from TopicQueueId to preferred host for processing that partition. * In most cases, use [[LocationStrategy.PreferConsistent]] * @param optionParams optional configs, see [[RocketMQConfig]] for more details. 
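 *
 * A hedged usage sketch (group id, topic and name server address are assumptions; `ssc` is an existing StreamingContext):
 * {{{
 *   val params = new ju.HashMap[String, String]()
 *   params.put(RocketMQConfig.NAME_SERVER_ADDR, "localhost:9876")
 *   val stream = RocketMqUtils.createMQPullStream(ssc, "demo_group",
 *     ju.Collections.singletonList("topicA"), ConsumerStrategy.lastest,
 *     autoCommit = true, forceSpecial = false, failOnDataLoss = false,
 *     optionParams = params)
 * }}}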
* @return InputDStream[MessageExt] */ def createMQPullStream( ssc: StreamingContext, groupId: String, topics: ju.Collection[jl.String], consumerStrategy: ConsumerStrategy, autoCommit: Boolean, forceSpecial: Boolean, failOnDataLoss: Boolean, locationStrategy: LocationStrategy = PreferConsistent, optionParams: ju.Map[String, String] = new ju.HashMap ): InputDStream[MessageExt] = { new MQPullInputDStream(ssc, groupId, topics, optionParams, locationStrategy, consumerStrategy, autoCommit, forceSpecial, failOnDataLoss) } def createMQPullStream( ssc: StreamingContext, groupId: String, topic: String, consumerStrategy: ConsumerStrategy, autoCommit: Boolean, forceSpecial: Boolean, failOnDataLoss: Boolean, optionParams: ju.Map[String, String] ): InputDStream[MessageExt] = { val topics = new ju.ArrayList[String]() topics.add(topic) new MQPullInputDStream(ssc, groupId, topics, optionParams, PreferConsistent, consumerStrategy, autoCommit, forceSpecial, failOnDataLoss) } /** * Java constructor for a RocketMq DStream * @param groupId it is for rocketMq for identifying the consumer * @param topics the topics for the rocketmq * @param consumerStrategy consumerStrategy In most cases, pass in [[ConsumerStrategy.lastest]], * see [[ConsumerStrategy]] for more details * @param autoCommit whether commit the offset to the rocketmq server automatically or not. If the user * implement the [[OffsetCommitCallback]], the autoCommit must be set false * @param forceSpecial Generally if the rocketmq server has checkpoint for the [[MessageQueue]], then the consumer * will consume from the checkpoint no matter we specify the offset or not. But if forceSpecial is true, * the rocketmq will start consuming from the specific available offset in any case. * @param failOnDataLoss Zero data lost is not guaranteed when topics are deleted. If zero data lost is critical, * the user must make sure all messages in a topic have been processed when deleting a topic. * @param locationStrategy map from TopicQueueId to preferred host for processing that partition. * In most cases, use [[LocationStrategy.PreferConsistent]] * @param optionParams optional configs, see [[RocketMQConfig]] for more details. 
* @return JavaInputDStream[MessageExt] */ def createJavaMQPullStream( ssc: JavaStreamingContext, groupId: String, topics: ju.Collection[jl.String], consumerStrategy: ConsumerStrategy, autoCommit: Boolean, forceSpecial: Boolean, failOnDataLoss: Boolean, locationStrategy: LocationStrategy = PreferConsistent, optionParams: ju.Map[String, String] = new ju.HashMap ): JavaInputDStream[MessageExt] = { val inputDStream = createMQPullStream(ssc.ssc, groupId, topics, consumerStrategy, autoCommit, forceSpecial, failOnDataLoss, locationStrategy, optionParams) new JavaInputDStream(inputDStream) } def createJavaMQPullStream( ssc: JavaStreamingContext, groupId: String, topics: ju.Collection[jl.String], consumerStrategy: ConsumerStrategy, autoCommit: Boolean, forceSpecial: Boolean, failOnDataLoss: Boolean): JavaInputDStream[MessageExt] = { val inputDStream = createMQPullStream(ssc.ssc, groupId, topics, consumerStrategy, autoCommit, forceSpecial, failOnDataLoss) new JavaInputDStream(inputDStream) } def mkPullConsumerInstance(groupId: String, optionParams: ju.Map[String, String], instance: String): DefaultMQPullConsumer = { val consumer = new DefaultMQPullConsumer(groupId) if (optionParams.containsKey(RocketMQConfig.PULL_TIMEOUT_MS)) consumer.setConsumerTimeoutMillisWhenSuspend(optionParams.get(RocketMQConfig.PULL_TIMEOUT_MS).toLong) val finalInstance = optionParams.getOrDefault("consumer.instance", instance) if (StringUtils.isNotBlank(finalInstance)) { consumer.setInstanceName(finalInstance) logger.warn(s"consumer.instance is set to: ${finalInstance}") } if (optionParams.containsKey(RocketMQConfig.NAME_SERVER_ADDR)) consumer.setNamesrvAddr(optionParams.get(RocketMQConfig.NAME_SERVER_ADDR)) consumer.start() consumer.setOffsetStore(consumer.getDefaultMQPullConsumerImpl.getOffsetStore) consumer } /** * For creating a Java push-mode DStream without reliability guarantees * @param jssc * @param properties * @param level * @return */ def createJavaMQPushStream( jssc: JavaStreamingContext, properties: Properties, level: StorageLevel ): JavaInputDStream[Message] = createJavaMQPushStream(jssc, properties, level, false) /** * For creating a reliable Java push-mode DStream * @param jssc * @param properties * @param level * @return */ def createJavaReliableMQPushStream( jssc: JavaStreamingContext, properties: Properties, level: StorageLevel ): JavaInputDStream[Message] = createJavaMQPushStream(jssc, properties, level, true) /** * For creating a Java push-mode DStream * @param jssc * @param properties * @param level * @param reliable * @return */ def createJavaMQPushStream( jssc: JavaStreamingContext, properties: Properties, level: StorageLevel, reliable: Boolean ): JavaInputDStream[Message] = { if (jssc == null || properties == null || level == null) return null val receiver = if (reliable) new ReliableRocketMQReceiver(properties, level) else new RocketMQReceiver(properties, level) val ds = jssc.receiverStream(receiver) ds } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/spark/sql/rocketmq/CachedRocketMQConsumer.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaDataConsumer.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ * 2. Reuse underlying consumer instance for each consumer group */ package org.apache.spark.sql.rocketmq import java.util.concurrent.TimeoutException import java.{util => ju} import org.apache.commons.lang3.mutable.MutableInt import org.apache.rocketmq.client.consumer.{MQPullConsumer, PullStatus} import org.apache.rocketmq.common.message.{MessageExt, MessageQueue} import org.apache.spark.internal.Logging import org.apache.spark.sql.rocketmq.RocketMQSource._ import org.apache.spark.{SparkEnv, SparkException, TaskContext} import scala.collection.mutable /** * Consumer of single group, intended for cached reuse. Underlying consumer is threadsafe, but processing * the same queue and group id in multiple threads is usually bad anyway. */ private case class CachedRocketMQConsumer( consumer: MQPullConsumer, queue: MessageQueue, options: ju.Map[String, String]) extends Logging { import CachedRocketMQConsumer._ private val groupId = options.get(RocketMQConf.CONSUMER_GROUP) // Since group ID is uniquely generated for each task, the following options must be same for identical group ID // so they are not presented in CacheKey private val subExpression = options.getOrDefault(RocketMQConf.CONSUMER_SUB_EXPRESSION, "*") private val maxBatchSize = options.getOrDefault(RocketMQConf.PULL_MAX_BATCH_SIZE, "32").toInt /** indicates whether this consumer is in use or not */ @volatile var inUse = true /** Iterator to the already fetch data */ @volatile private var fetchedData = ju.Collections.emptyIterator[MessageExt] @volatile private var nextOffsetInFetchedData = UNKNOWN_OFFSET /** * Return the available offset range of the current partition. It's a pair of the earliest offset * and the latest offset. */ def getAvailableOffsetRange(): AvailableOffsetRange = { val earliestOffset = consumer.minOffset(queue) val latestOffset = consumer.maxOffset(queue) AvailableOffsetRange(earliestOffset, latestOffset) } /** * Get the record for the given offset if available. Otherwise it will either throw error * (if failOnDataLoss = true), or return the next available offset within [offset, untilOffset), * or null. * * @param offset the offset to fetch. * @param untilOffset the max offset to fetch. Exclusive. * @param failOnDataLoss When `failOnDataLoss` is `true`, this method will either return record at * offset if available, or throw exception.when `failOnDataLoss` is `false`, * this method will either return record at offset if available, or return * the next earliest available record less than untilOffset, or null. It * will not throw any exception. */ def get( offset: Long, untilOffset: Long, pollTimeoutMs: Long, failOnDataLoss: Boolean): MessageExt = { require(offset < untilOffset, s"offset must always be less than untilOffset [offset: $offset, untilOffset: $untilOffset]") logDebug(s"Get $groupId $queue requested $offset") // The following loop is basically for `failOnDataLoss = false`. 
When `failOnDataLoss` is // `false`, first, we will try to fetch the record at `offset`. If no such record exists, then // we will move to the next available offset within `[offset, untilOffset)` and retry. // If `failOnDataLoss` is `true`, the loop body will be executed only once. var toFetchOffset = offset var consumerRecord: MessageExt = null // We want to break out of the while loop on a successful fetch to avoid using "return" // which may causes a NonLocalReturnControl exception when this method is used as a function. var isFetchComplete = false while (toFetchOffset != UNKNOWN_OFFSET && !isFetchComplete) { try { consumerRecord = fetchData(toFetchOffset, untilOffset, pollTimeoutMs, failOnDataLoss) isFetchComplete = true } catch { case e: OffsetIllegalException => // When there is some error thrown, reset all states resetFetchedData() reportDataLoss(failOnDataLoss, s"Cannot fetch offset $toFetchOffset: ${e.toString}", e) toFetchOffset = getEarliestAvailableOffsetBetween(toFetchOffset, untilOffset, e.availableOffsetRange) } } if (isFetchComplete) { consumerRecord } else { resetFetchedData() null } } /** * Return the next earliest available offset in [offset, untilOffset). If all offsets in * [offset, untilOffset) are invalid (e.g., the topic is deleted and recreated), it will return * `UNKNOWN_OFFSET`. */ private def getEarliestAvailableOffsetBetween(offset: Long, untilOffset: Long, range: AvailableOffsetRange): Long = { logWarning(s"Some data may be lost. Recovering from the earliest offset: ${range.earliest}") if (offset >= range.latest || range.earliest >= untilOffset) { // [offset, untilOffset) and [earliestOffset, latestOffset) have no overlap, // either // -------------------------------------------------------- // ^ ^ ^ ^ // | | | | // earliestOffset latestOffset offset untilOffset // // or // -------------------------------------------------------- // ^ ^ ^ ^ // | | | | // offset untilOffset earliestOffset latestOffset val warningMessage = s""" |The current available offset range is [${range.earliest}, ${range.latest}). | Offset $offset is out of range, and records in [$offset, $untilOffset) will be | skipped ${additionalMessage(failOnDataLoss = false)} """.stripMargin logWarning(warningMessage) UNKNOWN_OFFSET } else if (offset >= range.earliest) { // ----------------------------------------------------------------------------- // ^ ^ ^ ^ // | | | | // earliestOffset offset min(untilOffset,latestOffset) max(untilOffset, latestOffset) // // This will happen when a topic is deleted and recreated, and new data are pushed very fast, // then we will see `offset` disappears first then appears again. Although the parameters // are same, the state in RocketMQ cluster is changed, so the outer loop won't be endless. logWarning(s"Found a disappeared offset $offset. " + s"Some data may be lost ${additionalMessage(failOnDataLoss = false)}") offset } else { // ------------------------------------------------------------------------------ // ^ ^ ^ ^ // | | | | // offset earliestOffset min(untilOffset,latestOffset) max(untilOffset, latestOffset) val warningMessage = s""" |The current available offset range is [${range.earliest}, ${range.latest}). | Offset $offset is out of range, and records in [$offset, ${range.earliest}) will be | skipped ${additionalMessage(failOnDataLoss = false)} """.stripMargin logWarning(warningMessage) range.earliest } } /** * Get the record for the given offset if available. 
Otherwise it will either throw error * (if failOnDataLoss = true), or return the next available offset within [offset, untilOffset), * or null. * * @throws OffsetIllegalException if `offset` is out of range * @throws TimeoutException if cannot fetch the record in `pollTimeoutMs` milliseconds. */ private def fetchData( offset: Long, untilOffset: Long, pollTimeoutMs: Long, failOnDataLoss: Boolean): MessageExt = { if (offset != nextOffsetInFetchedData || !fetchedData.hasNext) { // This is the first fetch, or the last pre-fetched data has been drained. val p = consumer.pull(queue, subExpression, offset, maxBatchSize, pollTimeoutMs) if (p.getPullStatus == PullStatus.OFFSET_ILLEGAL){ throw new OffsetIllegalException(AvailableOffsetRange(p.getMinOffset, p.getMaxOffset)) } else if (p.getPullStatus == PullStatus.NO_MATCHED_MSG || p.getPullStatus == PullStatus.NO_NEW_MSG) { throw new IllegalStateException(s"Cannot fetch record for offset $offset in $pollTimeoutMs milliseconds. " + s"status = ${p.getPullStatus.toString}") } fetchedData = p.getMsgFoundList.iterator assert(fetchedData.hasNext) } val record = fetchedData.next() assert(record.getQueueOffset == offset, s"Got wrong record for $groupId ${queue.toString} even after seeking to offset $offset") nextOffsetInFetchedData = record.getQueueOffset + 1 // In general, RocketMQ uses the specified offset as the start point, and tries to fetch the next // available offset. Hence we need to handle offset mismatch. if (record.getQueueOffset > offset) { // This may happen when some records aged out but their offsets already got verified if (failOnDataLoss) { reportDataLoss(true, s"Cannot fetch records in [$offset, ${record.getQueueOffset})") // Never happen as "reportDataLoss" will throw an exception null } else { if (record.getQueueOffset >= untilOffset) { reportDataLoss(false, s"Skip missing records in [$offset, $untilOffset)") null } else { reportDataLoss(false, s"Skip missing records in [$offset, ${record.getQueueOffset})") record } } } else if (record.getQueueOffset < offset) { // This should not happen. If it does happen, then we probably misunderstand RocketMQ internal // mechanism. throw new IllegalStateException( s"Tried to fetch $offset but the returned record offset was ${record.getQueueOffset}") } else { record } } /** Reset the internal pre-fetched data. */ private def resetFetchedData(): Unit = { nextOffsetInFetchedData = UNKNOWN_OFFSET fetchedData = ju.Collections.emptyIterator[MessageExt] } /** * Return an addition message including useful message and instruction. */ private def additionalMessage(failOnDataLoss: Boolean): String = { if (failOnDataLoss) { s"(GroupId: $groupId, MessageQueue: $queue). " + s"$INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE" } else { s"(GroupId: $groupId, MessageQueue: $queue). " + s"$INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE" } } /** * Throw an exception or log a warning as per `failOnDataLoss`. 
*/ private def reportDataLoss( failOnDataLoss: Boolean, message: String, cause: Throwable = null): Unit = { val finalMessage = s"$message ${additionalMessage(failOnDataLoss)}" reportDataLoss0(failOnDataLoss, finalMessage, cause) } def close(): Unit = { // Shutdown the underlying consumer if nobody is using val consumerToShutdown = synchronized { val useCount = groupIdUseCount(groupId).decrementAndGet() if (useCount == 0) { groupIdUseCount.remove(groupId) groupIdToClient.remove(groupId) } else None } if (consumerToShutdown.isDefined) { consumerToShutdown.get.shutdown() } } } private case class AvailableOffsetRange(earliest: Long, latest: Long) private object CachedRocketMQConsumer extends Logging { private val UNKNOWN_OFFSET = -2L private case class CacheKey(groupId: String, queue: MessageQueue) private object CacheKey { def from(queue: MessageQueue, options: ju.Map[String, String]): CacheKey = { CacheKey(options.get(RocketMQConf.CONSUMER_GROUP), queue) } } private class OffsetIllegalException(val availableOffsetRange: AvailableOffsetRange) extends Exception private lazy val cache = { val conf = SparkEnv.get.conf val capacity = conf.getInt(RocketMQConf.PULL_CONSUMER_CACHE_MAX_CAPACITY, 64) new ju.LinkedHashMap[CacheKey, CachedRocketMQConsumer](capacity, 0.75f, true) { override def removeEldestEntry( entry: ju.Map.Entry[CacheKey, CachedRocketMQConsumer]): Boolean = { if (!entry.getValue.inUse && this.size > capacity) { logWarning( s"RocketMQConsumer cache hitting max capacity of $capacity, " + s"removing consumer for ${entry.getKey}") try { entry.getValue.close() } catch { case e: SparkException => logError(s"Error closing earliest RocketMQ consumer for ${entry.getKey}", e) } true } else { false } } } } // The MQPullConsumer client is shared by multiple instances of CachedRocketMQConsumer // because RocketMQ claims there should not be more than one instance for a groupId private val groupIdToClient = mutable.Map[String, MQPullConsumer]() // For cleaning unused clients private val groupIdUseCount = mutable.Map[String, MutableInt]() def releaseConsumer( queue: MessageQueue, options: ju.Map[String, String]): Unit = { val key = CacheKey.from(queue, options) synchronized { val consumer = cache.get(key) if (consumer != null) { consumer.inUse = false } else { logWarning(s"Attempting to release consumer that does not exist") } } } /** * Removes (and closes) the RocketMQ Consumer for the given MessageQueue and groupId. */ def removeConsumer( queue: MessageQueue, options: ju.Map[String, String]): Unit = { val key = CacheKey.from(queue, options) synchronized { val removedConsumer = cache.remove(key) if (removedConsumer != null) { removedConsumer.close() } } } /** * Get a cached consumer for groupId, assigned to topic and partition. * If matching consumer doesn't already exist, will be created using options. 
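 *
 * Illustrative sketch (group id, topic, broker name and queue id are assumptions):
 * {{{
 *   val options = new ju.HashMap[String, String]()
 *   options.put(RocketMQConf.CONSUMER_GROUP, "demo_group")
 *   val consumer = CachedRocketMQConsumer.getOrCreate(new MessageQueue("topicA", "broker-a", 0), options)
 * }}}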
*/ def getOrCreate( queue: MessageQueue, options: ju.Map[String, String]): CachedRocketMQConsumer = synchronized { val key = CacheKey.from(queue, options) // The MQPullConsumer client is shared by multiple instances of CachedRocketMQConsumer // because RocketMQ claims there should not be more than one instance for a groupId val groupId = options.get(RocketMQConf.CONSUMER_GROUP) val client = synchronized { groupIdUseCount.getOrElseUpdate(groupId, new MutableInt(0)).increment() groupIdToClient.getOrElseUpdate(groupId, RocketMQUtils.makePullConsumer(groupId, options)) } // If this is reattempt at running the task, then invalidate cache and start with // a new consumer if (TaskContext.get != null && TaskContext.get.attemptNumber >= 1) { removeConsumer(queue, options) val consumer = new CachedRocketMQConsumer(client, queue, options) consumer.inUse = true cache.put(key, consumer) consumer } else { if (!cache.containsKey(key)) { cache.put(key, new CachedRocketMQConsumer(client, queue, options)) } val consumer = cache.get(key) consumer.inUse = true consumer } } /** Create an [[CachedRocketMQConsumer]] but don't put it into cache. */ def createUncached( queue: MessageQueue, options: ju.Map[String, String]): CachedRocketMQConsumer = { val groupId = options.get(RocketMQConf.CONSUMER_GROUP) val client = synchronized { groupIdUseCount.getOrElseUpdate(groupId, new MutableInt(0)).increment() groupIdToClient.getOrElseUpdate(groupId, RocketMQUtils.makePullConsumer(groupId, options)) } new CachedRocketMQConsumer(client, queue, options) } private def reportDataLoss0( failOnDataLoss: Boolean, finalMessage: String, cause: Throwable = null): Unit = { if (failOnDataLoss) { if (cause != null) { throw new IllegalStateException(finalMessage, cause) } else { throw new IllegalStateException(finalMessage) } } else { if (cause != null) { logWarning(finalMessage, cause) } else { logWarning(finalMessage) } } } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/spark/sql/rocketmq/CachedRocketMQProducer.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/CachedKafkaProducer.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ * 2. 
Reuse underlying producer instance for each producer group */ package org.apache.spark.sql.rocketmq import java.util.concurrent._ import java.{util => ju} import com.google.common.cache._ import com.google.common.util.concurrent.{ExecutionError, UncheckedExecutionException} import org.apache.rocketmq.client.producer.DefaultMQProducer import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import scala.util.control.NonFatal private[rocketmq] object CachedRocketMQProducer extends Logging { private type Producer = DefaultMQProducer private lazy val cacheExpireTimeout: Long = SparkEnv.get.conf.getTimeAsMs(RocketMQConf.PRODUCER_CACHE_TIMEOUT, "10m") private val removalListener = new RemovalListener[String, Producer]() { override def onRemoval( notification: RemovalNotification[String, Producer]): Unit = { val group: String = notification.getKey val producer: Producer = notification.getValue logDebug( s"Evicting RocketMQ producer $producer for group $group, due to ${notification.getCause}") close(group, producer) } } private lazy val guavaCache: Cache[String, Producer] = CacheBuilder.newBuilder().expireAfterAccess(cacheExpireTimeout, TimeUnit.MILLISECONDS) .removalListener(removalListener) .build[String, Producer]() /** * Get a cached RocketMQProducer for a given configuration. If matching RocketMQProducer doesn't * exist, a new RocketMQProducer will be created. RocketMQProducer is thread safe, it is best to keep * one instance per specified options. */ def getOrCreate(options: ju.Map[String, String]): Producer = { val group = options.get(RocketMQConf.PRODUCER_GROUP) try { guavaCache.get(group, new Callable[Producer] { override def call(): Producer = { val producer = RocketMQUtils.makeProducer(group, options) logDebug(s"Created a new instance of RocketMQ producer for group $group.") producer } }) } catch { case e @ (_: ExecutionException | _: UncheckedExecutionException | _: ExecutionError) if e.getCause != null => throw e.getCause } } /** For explicitly closing RocketMQ producer */ private def close(options: ju.Map[String, String]): Unit = { val group = options.get(RocketMQConf.PRODUCER_GROUP) guavaCache.invalidate(group) } /** Auto close on cache evict */ private def close(group: String, producer: Producer): Unit = { try { logInfo(s"Closing the RocketMQ producer of group $group") producer.shutdown() } catch { case NonFatal(e) => logWarning("Error while closing RocketMQ producer.", e) } } private def clear(): Unit = { logInfo("Cleaning up guava cache.") guavaCache.invalidateAll() } // Intended for testing purpose only. private def getAsMap: ConcurrentMap[String, Producer] = guavaCache.asMap() } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/spark/sql/rocketmq/JsonUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.sql.rocketmq import java.{util => ju} import org.apache.rocketmq.common.message.MessageQueue import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import scala.collection.JavaConverters._ import scala.util.control.NonFatal /** * Utilities for converting RocketMQ related objects to and from json. */ private object JsonUtils { private implicit val formats = Serialization.formats(NoTypeHints) /** * Read MessageQueues from json string */ def partitions(str: String): Array[MessageQueue] = { try { Serialization.read[Map[String, Map[String, Seq[Int]]]](str).flatMap { case (topic, broker) => broker.flatMap { bq => bq._2.map(qid => new MessageQueue(topic, bq._1, qid)) } }.toArray } catch { case NonFatal(x) => throw new IllegalArgumentException( s"""Expected e.g. {"topicA":{"broker1":[0,1],"broker2":[0,1]},"topicB":{"broker3":[0,1]}}, got $str""") } } /** * Write MessageQueues as json string */ def partitions(mqs: Iterable[MessageQueue]): String = { var result = Map[String, Map[String, List[Int]]]() mqs.foreach { q => var brokers = result.getOrElse(q.getTopic, Map.empty) var queueIds = brokers.getOrElse(q.getBrokerName, List.empty) queueIds = queueIds :+ q.getQueueId brokers += q.getBrokerName -> queueIds result += q.getTopic -> brokers } Serialization.write(result) } /** * Read per-MessageQueue offsets from json string */ def partitionOffsets(str: String): Map[MessageQueue, Long] = { try { Serialization.read[Map[String, Map[String, Map[Int, Long]]]](str).flatMap { case (topic, brokers) => brokers.flatMap { case (broker, queues) => queues.map { case (queue, offset) => new MessageQueue(topic, broker, queue) -> offset } } }.toMap } catch { case NonFatal(x) => throw new IllegalArgumentException( s"""Expected e.g. {"topicA":{"broker1":{"0":23,"1":-1},"broker2":{"0":23}},"topicB":{"broker3":{"0":-2}}}, got $str""") } } /** * Write per-MessageQueue offsets as json string */ def partitionOffsets(queueOffsets: Map[MessageQueue, Long]): String = { var result = Map[String, Map[String, Map[Int, Long]]]() val partitions = queueOffsets.keySet.toSeq.sorted // sort for more determinism partitions.foreach { q => val offset = queueOffsets(q) var brokers = result.getOrElse(q.getTopic, Map.empty) var queues = brokers.getOrElse(q.getBrokerName, Map.empty) queues += q.getQueueId -> offset brokers += q.getBrokerName -> queues result += q.getTopic -> brokers } Serialization.write(result) } /** * Serialize RocketMQ message properties as json string */ def messageProperties(properties: ju.Map[String, String]): String = { Serialization.write(properties.asScala) } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/spark/sql/rocketmq/RocketMQConf.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.sql.rocketmq /** * Options for RocketMQ consumer or producer client. * * See also [[RocketMQSourceProvider]] */ object RocketMQConf { //******************************* // Shared Options //******************************* val NAME_SERVER_ADDR = "nameserver" //******************************* // Source (Consumer) Options //******************************* val CONSUMER_GROUP = "group" val CONSUMER_TOPIC = "topic" // What point should be consuming from (options: "earliest", "latest", default: "latest") val CONSUMER_OFFSET = "startingoffsets" // Subscription expression (default: "*") val CONSUMER_SUB_EXPRESSION = "subexpression" // To pick up the consume speed, the consumer can pull a batch of messages at a time (default: 32) val PULL_MAX_BATCH_SIZE = "pullbatchsize" // Pull timeout for the consumer (default: 3000) val PULL_TIMEOUT_MS = "pulltimeoutms" //******************************* // Sink (Producer) Options //******************************* val PRODUCER_GROUP = "group" // Default topic of produced messages if `topic` is not among the attributes val PRODUCER_TOPIC = "topic" //******************************* // Spark Context Options //******************************* // Max number of cached pull consumer (default: 64) val PULL_CONSUMER_CACHE_MAX_CAPACITY = "spark.sql.rocketmq.pull.consumer.cache.maxCapacity" // Producer cache timeout (default: "10m") val PRODUCER_CACHE_TIMEOUT = "spark.rocketmq.producer.cache.timeout" } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/spark/sql/rocketmq/RocketMQOffsetRangeLimit.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaOffsetRangeLimit.scala * * There are some modifications: * 1. Adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import org.apache.rocketmq.common.message.MessageQueue /** * Objects that represent desired offset range limits for starting, * ending, and specific offsets. 
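 *
 * For instance, binding a single queue to a fixed starting offset (queue coordinates and offset are assumptions) could be expressed as:
 * {{{
 *   SpecificOffsetRangeLimit(Map(new MessageQueue("topicA", "broker-a", 0) -> 42L))
 * }}}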
*/ private[rocketmq] sealed trait RocketMQOffsetRangeLimit /** * Represents the desire to bind to the earliest offsets in RocketMQ */ private[rocketmq] case object EarliestOffsetRangeLimit extends RocketMQOffsetRangeLimit /** * Represents the desire to bind to the latest offsets in RocketMQ */ private[rocketmq] case object LatestOffsetRangeLimit extends RocketMQOffsetRangeLimit /** * Represents the desire to bind to specific offsets. A offset == -1 binds to the * latest offset, and offset == -2 binds to the earliest offset. */ private[rocketmq] case class SpecificOffsetRangeLimit( partitionOffsets: Map[MessageQueue, Long]) extends RocketMQOffsetRangeLimit private[rocketmq] object RocketMQOffsetRangeLimit { /** * Used to denote offset range limits that are resolved via RocketMQ */ val LATEST = -1L // indicates resolution to the latest offset val EARLIEST = -2L // indicates resolution to the earliest offset } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/spark/sql/rocketmq/RocketMQOffsetReader.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaOffsetReader.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import java.{util => ju} import org.apache.rocketmq.client.consumer.MQPullConsumer import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.internal.Logging import scala.collection.JavaConverters._ import scala.util.control.NonFatal /** * This class uses RocketMQ's own [[MQPullConsumer]] API to read data offsets from RocketMQ. */ private class RocketMQOffsetReader( driverRocketMQParams: ju.Map[String, String], readerOptions: Map[String, String], driverGroupIdPrefix: String) extends Logging { val topic: String = driverRocketMQParams.get(RocketMQConf.CONSUMER_TOPIC) /** * Place [[groupId]] and [[nextId]] here so that they are initialized before any consumer is * created -- see SPARK-19564. */ private var groupId: String = _ private var nextId = 0 /** * A RocketMQConsumer used in the driver to query the latest RocketMQ offsets. This only queries the * offsets and never commits them. */ protected var consumer: MQPullConsumer = createConsumer() private val maxOffsetFetchAttempts = readerOptions.getOrElse("fetchOffset.numRetries", "3").toInt private val offsetFetchAttemptIntervalMs = readerOptions.getOrElse("fetchOffset.retryIntervalMs", "1000").toLong private def nextGroupId(): String = { groupId = driverGroupIdPrefix + "-" + nextId nextId += 1 groupId } /** * Closes the connection to RocketMQ, and cleans up state. 
*/ def close(): Unit = { consumer.shutdown() } /** * @return The Set of MessageQueue for a given topic */ def fetchTopicPartitions(): Set[MessageQueue] = { val partitions = consumer.fetchSubscribeMessageQueues(topic) partitions.asScala.toSet } /** * Resolves the specific offsets based on RocketMQ seek positions. * This method resolves offset value -1 to the latest and -2 to the * earliest RocketMQ seek position. * * @param partitionOffsets the specific offsets to resolve * @param reportDataLoss callback to either report or log data loss depending on setting */ def fetchSpecificOffsets( partitionOffsets: Map[MessageQueue, Long], reportDataLoss: String => Unit): RocketMQSourceOffset = { val fetched = { withRetries { val partitions = consumer.fetchSubscribeMessageQueues(topic) assert(partitions.asScala == partitionOffsets.keySet, "If startingOffsets contains specific offsets, you must specify all TopicPartitions.\n" + "Use -1 for latest, -2 for earliest, if you don't care.\n" + s"Specified: ${partitionOffsets.keySet} Assigned: ${partitions.asScala}") logDebug(s"Partitions assigned to consumer: $partitions. Seeking to $partitionOffsets") partitionOffsets.foreach { case (mq, RocketMQOffsetRangeLimit.LATEST) => consumer.updateConsumeOffset(mq, consumer.maxOffset(mq)) case (mq, RocketMQOffsetRangeLimit.EARLIEST) => consumer.updateConsumeOffset(mq, consumer.minOffset(mq)) case (mq, offset) => consumer.updateConsumeOffset(mq, offset) } partitionOffsets.map { case (mq, _) => mq -> consumer.fetchConsumeOffset(mq, false) } } } partitionOffsets.foreach { case (tp, off) if off != RocketMQOffsetRangeLimit.LATEST && off != RocketMQOffsetRangeLimit.EARLIEST => if (fetched(tp) != off) { reportDataLoss( s"startingOffsets for $tp was $off but consumer reset to ${fetched(tp)}") } case _ => // no real way to check that beginning or end is reasonable } RocketMQSourceOffset(fetched) } /** * Fetch the earliest offsets for the topic partitions */ def fetchEarliestOffsets(): Map[MessageQueue, Long] = { withRetries { val partitions = consumer.fetchSubscribeMessageQueues(topic) logDebug(s"Partitions assigned to consumer: $partitions. Seeking to the beginning") val partitionOffsets = partitions.asScala.map(p => p -> consumer.minOffset(p)).toMap logDebug(s"Got earliest offsets for partition : $partitionOffsets") partitionOffsets } } /** * Fetch the latest offsets for the topic partitions */ def fetchLatestOffsets(): Map[MessageQueue, Long] = { withRetries { val partitions = consumer.fetchSubscribeMessageQueues(topic) logDebug(s"Partitions assigned to consumer: $partitions. Seeking to the end.") val partitionOffsets = partitions.asScala.map(p => p -> consumer.maxOffset(p)).toMap logDebug(s"Got latest offsets for partition : $partitionOffsets") partitionOffsets } } /** * Fetch the earliest offsets for specific topic partitions. * The return result may not contain some partitions if they are deleted. */ def fetchEarliestOffsets( newPartitions: Seq[MessageQueue]): Map[MessageQueue, Long] = { if (newPartitions.isEmpty) { Map.empty[MessageQueue, Long] } else { withRetries { val partitions = consumer.fetchSubscribeMessageQueues(topic) logDebug(s"\tPartitions assigned to consumer: $partitions") // Get the earliest offset of each partition val partitionOffsets = newPartitions.filter { p => // When deleting topics happen at the same time, some partitions may not be in // `partitions`. 
So we need to ignore them partitions.contains(p) }.map(p => p -> consumer.minOffset(p)).toMap logDebug(s"Got earliest offsets for new partitions: $partitionOffsets") partitionOffsets } } } /** * Helper function that does multiple retries on a body of code that returns offsets. * Retries are needed to handle transient failures. For e.g. race conditions between getting * assignment and getting position while topics/partitions are deleted can cause NPEs. */ private def withRetries( body: => Map[MessageQueue, Long]): Map[MessageQueue, Long] = synchronized { var result: Option[Map[MessageQueue, Long]] = None var attempt = 1 var lastException: Throwable = null while (result.isEmpty && attempt <= maxOffsetFetchAttempts) { try { result = Some(body) } catch { case NonFatal(e) => lastException = e logWarning(s"Error in attempt $attempt getting RocketMQ offsets: ", e) attempt += 1 Thread.sleep(offsetFetchAttemptIntervalMs) resetConsumer() } } if (result.isEmpty) { assert(attempt > maxOffsetFetchAttempts) assert(lastException != null) throw lastException } result.get } /** * Create a consumer using the new generated group id. We always use a new consumer to avoid * just using a broken consumer to retry on RocketMQ errors, which likely will fail again. */ private def createConsumer(): MQPullConsumer = synchronized { val newRocketMQParams = new ju.HashMap[String, String](driverRocketMQParams) val groupId = nextGroupId() RocketMQUtils.makePullConsumer(groupId, newRocketMQParams) } private def resetConsumer(): Unit = synchronized { consumer.shutdown() consumer = createConsumer() } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/spark/sql/rocketmq/RocketMQRelation.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaRelation.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ * 2. 
Schema of output dataframe adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import java.util.UUID import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.unsafe.types.UTF8String private[rocketmq] class RocketMQRelation( override val sqlContext: SQLContext, sourceOptions: Map[String, String], optionParams: Map[String, String], failOnDataLoss: Boolean, startingOffsets: RocketMQOffsetRangeLimit, endingOffsets: RocketMQOffsetRangeLimit) extends BaseRelation with TableScan with Logging { assert(startingOffsets != LatestOffsetRangeLimit, "Starting offset not allowed to be set to latest offsets.") assert(endingOffsets != EarliestOffsetRangeLimit, "Ending offset not allowed to be set to earliest offsets.") private val pollTimeoutMs = sourceOptions.getOrElse( RocketMQConf.PULL_TIMEOUT_MS, sqlContext.sparkContext.conf.getTimeAsMs("spark.network.timeout", "120s").toString ).toLong override def schema: StructType = RocketMQSource.schema override def buildScan(): RDD[Row] = { // Each running query should use its own group id. Otherwise, the query may be only assigned // partial data since RocketMQ will assign partitions to multiple consumers having the same group // id. Hence, we should generate a unique id for each query. val uniqueGroupId = s"spark-rocketmq-relation-${UUID.randomUUID}" val offsetReader = new RocketMQOffsetReader( RocketMQSourceProvider.paramsForDriver(optionParams), sourceOptions, driverGroupIdPrefix = s"$uniqueGroupId-driver") // Leverage the RocketMQReader to obtain the relevant partition offsets val (fromPartitionOffsets, untilPartitionOffsets) = { try { (getPartitionOffsets(offsetReader, startingOffsets), getPartitionOffsets(offsetReader, endingOffsets)) } finally { offsetReader.close() } } // Obtain topicPartitions in both from and until partition offset, ignoring // topic partitions that were added and/or deleted between the two above calls. if (fromPartitionOffsets.keySet != untilPartitionOffsets.keySet) { implicit val topicOrdering: Ordering[MessageQueue] = Ordering.by(t => t.getTopic) val fromTopics = fromPartitionOffsets.keySet.toList.sorted.mkString(",") val untilTopics = untilPartitionOffsets.keySet.toList.sorted.mkString(",") throw new IllegalStateException("different topic partitions " + s"for starting offsets topics[$fromTopics] and " + s"ending offsets topics[$untilTopics]") } // Calculate offset ranges val offsetRanges = untilPartitionOffsets.keySet.map { tp => val fromOffset = fromPartitionOffsets.getOrElse(tp, { // This should not happen since messageQueues contains all partitions not in // fromPartitionOffsets throw new IllegalStateException(s"$tp doesn't have a from offset") }) val untilOffset = untilPartitionOffsets(tp) RocketMQSourceRDDOffsetRange(tp, fromOffset, untilOffset, None) }.toArray logInfo("GetBatch generating RDD of offset range: " + offsetRanges.sortBy(_.messageQueue.toString).mkString(", ")) // Create an RDD that reads from RocketMQ and get the (key, value) pair as byte arrays. 
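// Note: unlike the Kafka relation this was ported from, the row built below carries
// topic, flag, body, properties, brokerName, queueId, queueOffset and the born/store
// timestamps rather than a (key, value) pair.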
val executorRocketMQParams = RocketMQSourceProvider.paramsForExecutors(optionParams, uniqueGroupId) val rdd = new RocketMQSourceRDD( sqlContext.sparkContext, executorRocketMQParams, offsetRanges, pollTimeoutMs, failOnDataLoss, reuseRocketMQConsumer = false).map { cr => // Remove the `brokerName` property which was added by us. See `RocketMQSourceRDD.compute` val brokerName = cr.getProperties.remove(RocketMQSource.PROP_BROKER_NAME) InternalRow( UTF8String.fromString(cr.getTopic), // topic cr.getFlag, // flag cr.getBody, // body UTF8String.fromString(JsonUtils.messageProperties(cr.getProperties)), // properties UTF8String.fromString(brokerName), // brokerName cr.getQueueId, // queueId cr.getQueueOffset, // queueOffset DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.getBornTimestamp)), // bornTimestamp DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.getStoreTimestamp)) // storeTimestamp ) } sqlContext.internalCreateDataFrame(rdd, schema).rdd } private def getPartitionOffsets( offsetReader: RocketMQOffsetReader, offsetRangeLimit: RocketMQOffsetRangeLimit): Map[MessageQueue, Long] = { def validateTopicPartitions(partitions: Set[MessageQueue], partitionOffsets: Map[MessageQueue, Long]): Map[MessageQueue, Long] = { assert(partitions == partitionOffsets.keySet, "If startingOffsets contains specific offsets, you must specify all TopicPartitions.\n" + "Use -1 for latest, -2 for earliest, if you don't care.\n" + s"Specified: ${partitionOffsets.keySet} Assigned: $partitions") logDebug(s"Partitions assigned to consumer: $partitions. Seeking to $partitionOffsets") partitionOffsets } val partitions = offsetReader.fetchTopicPartitions() // Obtain MessageQueue offsets with late binding support offsetRangeLimit match { case EarliestOffsetRangeLimit => partitions.map { case tp => tp -> RocketMQOffsetRangeLimit.EARLIEST }.toMap case LatestOffsetRangeLimit => partitions.map { case tp => tp -> RocketMQOffsetRangeLimit.LATEST }.toMap case SpecificOffsetRangeLimit(partitionOffsets) => validateTopicPartitions(partitions, partitionOffsets) } } override def toString: String = s"RocketMQRelation(start=$startingOffsets, end=$endingOffsets)" } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/spark/sql/rocketmq/RocketMQSink.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSink.scala * * There are some modifications: * 1. 
Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import java.{util => ju} import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.execution.streaming.Sink private[rocketmq] class RocketMQSink( sqlContext: SQLContext, executorRocketMQParams: ju.Map[String, String], topic: Option[String]) extends Sink with Logging { @volatile private var latestBatchId = -1L override def toString: String = "RocketMQSink" override def addBatch(batchId: Long, data: DataFrame): Unit = { if (batchId <= latestBatchId) { logInfo(s"Skipping already committed batch $batchId") } else { RocketMQWriter.write(sqlContext.sparkSession, data.queryExecution, executorRocketMQParams, topic) latestBatchId = batchId } } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/spark/sql/rocketmq/RocketMQSourceProvider.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSourceProvider.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ * 2. Schema of output dataframe adapted to RocketMQ * 3. Trait `StreamWriteSupport` and `ContinuousReadSupport` is not supported yet */ package org.apache.spark.sql.rocketmq import java.util.{Locale, UUID} import java.{util => ju} import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext, SaveMode} import org.apache.spark.sql.execution.streaming.{Sink, Source} import org.apache.spark.sql.sources._ import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import scala.collection.JavaConverters._ /** * The provider class for the [[RocketMQSource]]. This provider is designed such that it throws * IllegalArgumentException when the RocketMQ Dataset is created, so that it can catch * missing options even before the query is started. */ class RocketMQSourceProvider extends DataSourceRegister with StreamSourceProvider with RelationProvider with CreatableRelationProvider with StreamSinkProvider with Logging { import RocketMQSourceProvider._ override def shortName(): String = "rocketmq" /** * Returns the name and schema of the source. In addition, it also verifies whether the options * are correct and sufficient to create the [[RocketMQSource]] when the query is started. 
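* The schema is fixed by [[RocketMQSource.schema]]; a user-supplied schema is rejected.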
*/ override def sourceSchema( sqlContext: SQLContext, schema: Option[StructType], providerName: String, parameters: Map[String, String]): (String, StructType) = { validateStreamOptions(parameters) require(schema.isEmpty, "RocketMQ source has a fixed schema and cannot be set with a custom one") (shortName(), RocketMQSource.schema) } override def createSource( sqlContext: SQLContext, metadataPath: String, schema: Option[StructType], providerName: String, parameters: Map[String, String]): Source = { validateStreamOptions(parameters) // Each running query should use its own group id. Otherwise, the query may be only assigned // partial data since RocketMQ will assign partitions to multiple consumers having the same group // id. Hence, we should generate a unique id for each query. val uniqueGroupId = s"spark-rocketmq-source-${UUID.randomUUID}-${metadataPath.hashCode.toHexString}" val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } val startingStreamOffsets = RocketMQSourceProvider.getRocketMQOffsetRangeLimit(caseInsensitiveParams, RocketMQConf.CONSUMER_OFFSET, LatestOffsetRangeLimit) val offsetReader = new RocketMQOffsetReader( paramsForDriver(caseInsensitiveParams), parameters, driverGroupIdPrefix = s"$uniqueGroupId-driver") new RocketMQSource( sqlContext, offsetReader, paramsForExecutors(caseInsensitiveParams, uniqueGroupId), parameters, metadataPath, startingStreamOffsets, failOnDataLoss(caseInsensitiveParams)) } /** * Returns a new base relation with the given parameters. * * @note The parameters' keywords are case insensitive and this insensitivity is enforced * by the Map that is passed to the function. */ override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { validateBatchOptions(parameters) val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } val startingRelationOffsets = RocketMQSourceProvider.getRocketMQOffsetRangeLimit(caseInsensitiveParams, STARTING_OFFSETS_OPTION_KEY, EarliestOffsetRangeLimit) assert(startingRelationOffsets != LatestOffsetRangeLimit) val endingRelationOffsets = RocketMQSourceProvider.getRocketMQOffsetRangeLimit(caseInsensitiveParams, ENDING_OFFSETS_OPTION_KEY, LatestOffsetRangeLimit) assert(endingRelationOffsets != EarliestOffsetRangeLimit) new RocketMQRelation( sqlContext, sourceOptions = parameters, optionParams = caseInsensitiveParams, failOnDataLoss = failOnDataLoss(caseInsensitiveParams), startingOffsets = startingRelationOffsets, endingOffsets = endingRelationOffsets) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { mode match { case SaveMode.Overwrite | SaveMode.Ignore => throw new AnalysisException(s"Save mode $mode not allowed for RocketMQ. " + s"Allowed save modes are ${SaveMode.Append} and " + s"${SaveMode.ErrorIfExists} (default).") case _ => // good } val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } val defaultTopic = parameters.get(RocketMQConf.PRODUCER_TOPIC).map(_.trim) val uniqueGroupId = s"spark-rocketmq-sink-${UUID.randomUUID}" val specifiedKafkaParams = paramsForProducer(caseInsensitiveParams, uniqueGroupId) RocketMQWriter.write(sqlContext.sparkSession, data.queryExecution, specifiedKafkaParams, defaultTopic) /* This method is suppose to return a relation that reads the data that was written. * We cannot support this for RocketMQ. 
Therefore, in order to make things consistent, * we return an empty base relation. */ new BaseRelation { override def sqlContext: SQLContext = unsupportedException override def schema: StructType = unsupportedException override def needConversion: Boolean = unsupportedException override def sizeInBytes: Long = unsupportedException override def unhandledFilters(filters: Array[Filter]): Array[Filter] = unsupportedException private def unsupportedException = throw new UnsupportedOperationException("BaseRelation from RocketMQ write " + "operation is not usable.") } } private def failOnDataLoss(caseInsensitiveParams: Map[String, String]) = caseInsensitiveParams.getOrElse(FAIL_ON_DATA_LOSS_OPTION_KEY, "true").toBoolean private def validateGeneralOptions(caseInsensitiveParams: Map[String, String]) { // Validate source options if (!caseInsensitiveParams.contains(RocketMQConf.CONSUMER_TOPIC)) { throw new IllegalArgumentException(s"Option '${RocketMQConf.CONSUMER_TOPIC}' must be specified for RocketMQ source") } } override def createSink( sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } val defaultTopic = parameters.get(RocketMQConf.PRODUCER_TOPIC).map(_.trim) val uniqueGroupId = s"spark-rocketmq-sink-${UUID.randomUUID}" new RocketMQSink(sqlContext, paramsForProducer(caseInsensitiveParams, uniqueGroupId), defaultTopic) } private def validateStreamOptions(caseInsensitiveParams: Map[String, String]) { // Stream specific options validateGeneralOptions(caseInsensitiveParams) } private def validateBatchOptions(caseInsensitiveParams: Map[String, String]) { // Batch specific options RocketMQSourceProvider.getRocketMQOffsetRangeLimit( caseInsensitiveParams, STARTING_OFFSETS_OPTION_KEY, EarliestOffsetRangeLimit) match { case EarliestOffsetRangeLimit => // good to go case LatestOffsetRangeLimit => throw new IllegalArgumentException("starting offset can't be latest " + "for batch queries on RocketMQ") case SpecificOffsetRangeLimit(partitionOffsets) => partitionOffsets.foreach { case (mq, off) if off == RocketMQOffsetRangeLimit.LATEST => throw new IllegalArgumentException(s"starting offsets for $mq can't be latest for batch queries on RocketMQ") case _ => // ignore } } RocketMQSourceProvider.getRocketMQOffsetRangeLimit( caseInsensitiveParams, ENDING_OFFSETS_OPTION_KEY, LatestOffsetRangeLimit) match { case EarliestOffsetRangeLimit => throw new IllegalArgumentException("ending offset can't be earliest " + "for batch queries on RocketMQ") case LatestOffsetRangeLimit => // good to go case SpecificOffsetRangeLimit(partitionOffsets) => partitionOffsets.foreach { case (mq, off) if off == RocketMQOffsetRangeLimit.EARLIEST => throw new IllegalArgumentException(s"ending offset for $mq can't be " + "earliest for batch queries on RocketMQ") case _ => // ignore } } validateGeneralOptions(caseInsensitiveParams) // Don't want to throw an error, but at least log a warning. 
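// maxOffsetsPerTrigger only limits streaming micro-batches, so for batch queries it is
// merely logged and ignored.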
if (caseInsensitiveParams.get("maxoffsetspertrigger").isDefined) { logWarning("maxOffsetsPerTrigger option ignored in batch queries") } } } object RocketMQSourceProvider extends Logging { private[rocketmq] val STARTING_OFFSETS_OPTION_KEY = "startingoffsets" private[rocketmq] val ENDING_OFFSETS_OPTION_KEY = "endingoffsets" private[rocketmq] val FAIL_ON_DATA_LOSS_OPTION_KEY = "failondataloss" def getRocketMQOffsetRangeLimit( params: Map[String, String], offsetOptionKey: String, defaultOffsets: RocketMQOffsetRangeLimit): RocketMQOffsetRangeLimit = { params.get(offsetOptionKey).map(_.trim) match { case Some(offset) if offset.toLowerCase(Locale.ROOT) == "latest" => LatestOffsetRangeLimit case Some(offset) if offset.toLowerCase(Locale.ROOT) == "earliest" => EarliestOffsetRangeLimit case Some(json) => SpecificOffsetRangeLimit(JsonUtils.partitionOffsets(json)) case None => defaultOffsets } } def paramsForDriver(specifiedRocketMQParams: Map[String, String]): ju.Map[String, String] = { if (specifiedRocketMQParams.contains(RocketMQConf.CONSUMER_GROUP)) { throw new IllegalArgumentException( s"Option '${RocketMQConf.CONSUMER_GROUP}' can not be specified") } ConfigUpdater("source", specifiedRocketMQParams) // Set to "earliest" to avoid exceptions. However, RocketMQSource will fetch the initial // offsets by itself instead of counting on RocketMQConsumer. .set(RocketMQConf.CONSUMER_OFFSET, "earliest") // So that the driver does not pull too much data .set(RocketMQConf.PULL_MAX_BATCH_SIZE, "1") .build() } def paramsForExecutors( specifiedRocketMQParams: Map[String, String], uniqueGroupId: String): ju.Map[String, String] = { if (specifiedRocketMQParams.contains(RocketMQConf.CONSUMER_GROUP)) { throw new IllegalArgumentException( s"Option '${RocketMQConf.CONSUMER_GROUP}' can not be specified") } ConfigUpdater("executor", specifiedRocketMQParams) // So that consumers in executors do not mess with any existing group id .set(RocketMQConf.CONSUMER_GROUP, s"$uniqueGroupId-executor") .build() } def paramsForProducer( specifiedRocketMQParams: Map[String, String], uniqueGroupId: String): ju.Map[String, String] = { if (specifiedRocketMQParams.contains(RocketMQConf.PRODUCER_GROUP)) { throw new IllegalArgumentException( s"Option '${RocketMQConf.PRODUCER_GROUP}' can not be specified") } ConfigUpdater("executor", specifiedRocketMQParams) // So that consumers in executors do not mess with any existing group id .set(RocketMQConf.PRODUCER_GROUP, uniqueGroupId) .build() } /** Class to conveniently update RocketMQ config params, while logging the changes */ private case class ConfigUpdater(module: String, params: Map[String, String]) { private val map = new ju.HashMap[String, String](params.asJava) def set(key: String, value: String): this.type = { map.put(key, value) logDebug(s"$module: Set $key to $value, earlier value: ${params.getOrElse(key, "")}") this } def setIfUnset(key: String, value: String): ConfigUpdater = { if (!map.containsKey(key)) { map.put(key, value) logDebug(s"$module: Set $key to $value") } this } def build(): ju.Map[String, String] = map } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/spark/sql/rocketmq/RocketMQUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.sql.rocketmq import java.{util => ju} import org.apache.rocketmq.client.consumer.DefaultMQPullConsumer import org.apache.rocketmq.client.producer.DefaultMQProducer /** * Some helper methods of RocketMQ */ object RocketMQUtils { def makePullConsumer(groupId: String, optionParams: ju.Map[String, String]): DefaultMQPullConsumer = { val consumer = new DefaultMQPullConsumer(groupId) if (optionParams.containsKey(RocketMQConf.NAME_SERVER_ADDR)) { consumer.setNamesrvAddr(optionParams.get(RocketMQConf.NAME_SERVER_ADDR)) } consumer.start() consumer.setOffsetStore(consumer.getDefaultMQPullConsumerImpl.getOffsetStore) consumer } def makeProducer(groupId: String, optionParams: ju.Map[String, String]): DefaultMQProducer = { val producer = new DefaultMQProducer(groupId) if (optionParams.containsKey(RocketMQConf.NAME_SERVER_ADDR)) { producer.setNamesrvAddr(optionParams.get(RocketMQConf.NAME_SERVER_ADDR)) } producer.start() producer } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/spark/sql/rocketmq/RocketMQWriteTask.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaWriteTask.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import java.{util => ju} import org.apache.rocketmq.client.producer.{DefaultMQProducer, SendCallback, SendResult} import org.apache.rocketmq.common.message.Message import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal, UnsafeProjection} import org.apache.spark.sql.types.{BinaryType, StringType} /** * Writes out data in a single Spark task, without any concerns about how * to commit or abort tasks. Exceptions thrown by the implementation of this class will * automatically trigger task aborts. 
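* Sends are asynchronous; the producer callback records the first failure in `failedWrite`,
* which `checkForErrors()` rethrows.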
*/ private[rocketmq] class RocketMQWriteTask( options: ju.Map[String, String], inputSchema: Seq[Attribute], topic: Option[String]) extends RocketMQRowWriter(inputSchema, topic) { // used to synchronize with RocketMQ callbacks private var producer: DefaultMQProducer = _ /** * Writes key value data out to topics. */ def execute(iterator: Iterator[InternalRow]): Unit = { producer = CachedRocketMQProducer.getOrCreate(options) while (iterator.hasNext && failedWrite == null) { val currentRow = iterator.next() sendRow(currentRow, producer) } } def close(): Unit = { checkForErrors() if (producer != null) { checkForErrors() producer = null } } } private[rocketmq] abstract class RocketMQRowWriter( inputSchema: Seq[Attribute], topic: Option[String]) { // used to synchronize with RocketMQ callbacks @volatile protected var failedWrite: Throwable = _ protected val projection = createProjection private val callback = new SendCallback { override def onSuccess(sendResult: SendResult): Unit = {} override def onException(e: Throwable): Unit = { if (failedWrite == null) failedWrite = e } } /** * Send the specified row to the producer, with a callback that will save any exception * to failedWrite. Note that send is asynchronous; subclasses must flush() their producer before * assuming the row is in RocketMQ. */ protected def sendRow( row: InternalRow, producer: DefaultMQProducer): Unit = { val projectedRow = projection(row) val topic = projectedRow.getString(0) val keys = if (projectedRow.isNullAt(1)) null else projectedRow.getString(1) val body = projectedRow.getBinary(2) if (topic == null) { throw new NullPointerException(s"null topic present in the data. Use the " + s"${RocketMQConf.PRODUCER_TOPIC} option for setting a default topic.") } val record = new Message(topic, keys, body) producer.send(record, callback) // send asynchronously } protected def checkForErrors(): Unit = { if (failedWrite != null) { throw failedWrite } } private def createProjection = { val topicExpression = topic.map(Literal(_)).orElse { inputSchema.find(_.name == RocketMQWriter.TOPIC_ATTRIBUTE_NAME) }.getOrElse { throw new IllegalStateException(s"topic option required when no " + s"'${RocketMQWriter.TOPIC_ATTRIBUTE_NAME}' attribute is present") } topicExpression.dataType match { case StringType => // good case t => throw new IllegalStateException(s"${RocketMQWriter.TOPIC_ATTRIBUTE_NAME} " + s"attribute unsupported type $t. 
${RocketMQWriter.TOPIC_ATTRIBUTE_NAME} " + "must be a StringType") } val tagsExpression = inputSchema.find(_.name == RocketMQWriter.TAGS_ATTRIBUTE_NAME) .getOrElse(Literal(null, StringType)) tagsExpression.dataType match { case StringType => // good case t => throw new IllegalStateException(s"${RocketMQWriter.TAGS_ATTRIBUTE_NAME} " + s"attribute unsupported type $t") } val bodyExpression = inputSchema.find(_.name == RocketMQWriter.BODY_ATTRIBUTE_NAME).getOrElse( throw new IllegalStateException(s"Required attribute '${RocketMQWriter.BODY_ATTRIBUTE_NAME}' not found") ) bodyExpression.dataType match { case StringType | BinaryType => // good case t => throw new IllegalStateException(s"${RocketMQWriter.BODY_ATTRIBUTE_NAME} " + s"attribute unsupported type $t") } UnsafeProjection.create( Seq(topicExpression, tagsExpression, Cast(bodyExpression, BinaryType)), inputSchema) } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/spark/sql/rocketmq/RocketMQWriter.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaWriter.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ * 2. Input attributes are adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import java.{util => ju} import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} import org.apache.spark.sql.types.{BinaryType, StringType} import org.apache.spark.util.Utils /** * The [[RocketMQWriter]] class is used to write data from a batch query * or structured streaming query, given by a [[QueryExecution]], to RocketMQ. * The data is assumed to have body column and an optional topic and tag * column. If the topic column is missing, then the topic must come from * the 'topic' configuration option. */ private object RocketMQWriter extends Logging { val TOPIC_ATTRIBUTE_NAME: String = "topic" val TAGS_ATTRIBUTE_NAME: String = "tags" val BODY_ATTRIBUTE_NAME: String = "body" override def toString: String = "RocketMQWriter" def validateQuery( schema: Seq[Attribute], options: ju.Map[String, String], topic: Option[String] = None): Unit = { schema.find(_.name == TOPIC_ATTRIBUTE_NAME).getOrElse( if (topic.isEmpty) { throw new AnalysisException(s"topic option required when no " + s"'$TOPIC_ATTRIBUTE_NAME' attribute is present. 
Use the " + s"${RocketMQConf.PRODUCER_TOPIC} option for setting a topic.") } else { Literal(topic.get, StringType) } ).dataType match { case StringType => // good case _ => throw new AnalysisException(s"Topic type must be a String") } schema.find(_.name == TAGS_ATTRIBUTE_NAME).getOrElse( Literal(null, StringType) ).dataType match { case StringType => // good case _ => throw new AnalysisException(s"$TAGS_ATTRIBUTE_NAME attribute type must be String") } schema.find(_.name == BODY_ATTRIBUTE_NAME).getOrElse( throw new AnalysisException(s"Required attribute '$BODY_ATTRIBUTE_NAME' not found") ).dataType match { case StringType | BinaryType => // good case _ => throw new AnalysisException(s"$BODY_ATTRIBUTE_NAME attribute type " + s"must be a String or BinaryType") } } def write( sparkSession: SparkSession, queryExecution: QueryExecution, options: ju.Map[String, String], topic: Option[String] = None): Unit = { val schema = queryExecution.analyzed.output validateQuery(schema, options, topic) queryExecution.toRdd.foreachPartition { iter => val writeTask = new RocketMQWriteTask(options, schema, topic) Utils.tryWithSafeFinally(block = writeTask.execute(iter))( finallyBlock = writeTask.close()) } } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala/org/apache/spark/streaming/MQPullInputDStream.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.streaming import org.apache.rocketmq.client.consumer.DefaultMQPullConsumer import org.apache.rocketmq.client.consumer.store.ReadOffsetType import org.apache.rocketmq.common.MixAll import org.apache.rocketmq.common.message.{MessageExt, MessageQueue} import org.apache.rocketmq.spark._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.{DStream, DStreamCheckpointData, InputDStream} import org.apache.spark.streaming.scheduler.rate.RateEstimator import org.apache.spark.streaming.scheduler.{RateController, StreamInputInfo} import org.apache.spark.util.ThreadUtils import java.util.concurrent.atomic.AtomicReference import java.util.concurrent.{ConcurrentLinkedQueue, TimeUnit} import java.{lang => jl, util => ju} import scala.collection.JavaConverters._ import scala.collection.mutable /** * A DStream where * each given RocketMq topic/queueId corresponds to an RDD partition. * The configuration pull.max.speed.per.partition gives the maximum number * of messages per second that each '''partition''' will accept. * * @param groupId it is for rocketMq for identifying the consumer * @param topics the topics for the rocketmq * @param locationStrategy locationStrategy In most cases, pass in [[LocationStrategy.PreferConsistent]], * see [[LocationStrategy]] for more details. 
* @param consumerStrategy consumerStrategy In most cases, pass in [[ConsumerStrategy.lastest]], * see [[ConsumerStrategy]] for more details * @param autoCommit whether commit the offset to the rocketmq server automatically or not * @param forceSpecial Generally if the rocketmq server has checkpoint for the [[MessageQueue]], then the consumer * will consume from the checkpoint no matter we specify the offset or not. But if forceSpecial is true, * the rocketmq will start consuming from the specific available offset in any case. * @param failOnDataLoss Zero data lost is not guaranteed when topics are deleted. If zero data lost is critical, * the user must make sure all messages in a topic have been processed when deleting a topic. */ class MQPullInputDStream( _ssc: StreamingContext, groupId: String, topics: ju.Collection[jl.String], optionParams: ju.Map[String, String], locationStrategy: LocationStrategy, consumerStrategy: ConsumerStrategy, autoCommit: Boolean, forceSpecial: Boolean, failOnDataLoss: Boolean ) extends InputDStream[MessageExt](_ssc) with CanCommitOffsets{ private var currentOffsets = mutable.Map[TopicQueueId, Map[String, Long]]() private val commitQueue = new ConcurrentLinkedQueue[OffsetRange] private val commitCallback = new AtomicReference[OffsetCommitCallback] private val maxRateLimitPerPartition = optionParams.getOrDefault(RocketMQConfig.MAX_PULL_SPEED_PER_PARTITION, "-1").toInt @transient private var kc: DefaultMQPullConsumer = null /** * start up timer thread to persis the OffsetStore */ @transient private val scheduledExecutorService = ThreadUtils.newDaemonSingleThreadScheduledExecutor( "Driver-Commit-Thread") private def consumer() = this.synchronized { if (null == kc) { kc = RocketMqUtils.mkPullConsumerInstance(groupId, optionParams, "driver") val messageQueues = fetchSubscribeMessageQueues(topics) val iter = messageQueues.iterator while (iter.hasNext){ val messageQueue = iter.next val offset = computePullFromWhere(messageQueue) val topicQueueId = new TopicQueueId(messageQueue.getTopic, messageQueue.getQueueId) if (!currentOffsets.contains(topicQueueId)) { currentOffsets += topicQueueId -> Map(messageQueue.getBrokerName -> offset) } else { if (!currentOffsets(topicQueueId).contains(messageQueue.getBrokerName)){ currentOffsets(topicQueueId) += messageQueue.getBrokerName -> offset } } } // timer persist this.scheduledExecutorService.scheduleAtFixedRate( new Runnable() { def run() { try { kc.getOffsetStore.persistAll(fetchSubscribeMessageQueues(topics)) } catch { case e: Exception => { log.error("ScheduledTask persistAllConsumerOffset exception", e) } } } }, 1000 * 10, 1000 * 5, TimeUnit.MILLISECONDS) } kc } private def fetchSubscribeMessageQueues(topics : ju.Collection[jl.String]): ju.HashSet[MessageQueue] = { val messageQueueSet = new ju.HashSet[MessageQueue] val iter = topics.iterator while (iter.hasNext){ messageQueueSet.addAll(kc.fetchSubscribeMessageQueues(iter.next)) } messageQueueSet } private def computePullFromWhere(mq: MessageQueue): Long = { var result = -1L val offsetStore = kc.getOffsetStore val minOffset = kc.minOffset(mq) val checkpointOffset = offsetStore.readOffset(mq, ReadOffsetType.READ_FROM_STORE) consumerStrategy match { case LatestStrategy => { if (checkpointOffset >= 0) { //consider the checkpoint offset first if (checkpointOffset < minOffset) { reportDataLoss(s"MessageQueue $mq's checkpointOffset $checkpointOffset is smaller than minOffset $minOffset") result = kc.maxOffset(mq) } else { result = checkpointOffset } } else { // First start,no offset 
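// Retry topics (MixAll.RETRY_GROUP_TOPIC_PREFIX) start from offset 0 on first start;
// other topics fall back to the queue's max offset under LatestStrategy.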
if (mq.getTopic.startsWith(MixAll.RETRY_GROUP_TOPIC_PREFIX)) { result = 0 } else { result = kc.maxOffset(mq) } } } case EarliestStrategy => { if (checkpointOffset >= 0) { //consider the checkpoint offset first if (checkpointOffset < minOffset) { reportDataLoss(s"MessageQueue $mq's checkpointOffset $checkpointOffset is smaller than minOffset $minOffset") result = minOffset } else { result = checkpointOffset } } else { // First start,no offset result = minOffset } } case SpecificOffsetStrategy(queueToOffset) => { val specificOffset = queueToOffset.get(mq) if (checkpointOffset >= 0 && !forceSpecial) { if (checkpointOffset < minOffset) { reportDataLoss(s"MessageQueue $mq's checkpointOffset $checkpointOffset is smaller than minOffset $minOffset") result = minOffset } else { result = checkpointOffset } } else { specificOffset match { case Some(ConsumerStrategy.LATEST) => { result = kc.maxOffset(mq) } case Some(ConsumerStrategy.EARLIEST) => { result = kc.minOffset(mq) } case Some(offset) => { if (offset < minOffset) { reportDataLoss(s"MessageQueue $mq's specific offset $offset is smaller than minOffset $minOffset") result = minOffset } else { result = offset } } case None => { if (checkpointOffset >= 0) { //consider the checkpoint offset first if (checkpointOffset < minOffset) { reportDataLoss(s"MessageQueue $mq's checkpointOffset $checkpointOffset is smaller than minOffset $minOffset") result = minOffset } else { result = checkpointOffset } } else { logWarning(s"MessageQueue $mq's specific offset and checkpointOffset are none, then use the minOffset") result = kc.minOffset(mq) } } } } } } result } private def firstConsumerOffset(mq: MessageQueue): Long = { val offsetStore = kc.getOffsetStore val lastOffset = offsetStore.readOffset(mq, ReadOffsetType.READ_FROM_STORE) val minOffset = kc.minOffset(mq) if (lastOffset < minOffset) { reportDataLoss(s"MessageQueue $mq's checkpoint offset $lastOffset is smaller than minOffset $minOffset") minOffset } else { lastOffset } } override def persist(newLevel: StorageLevel): DStream[MessageExt] = { logError("rocketmq MessageExt is not serializable. " + "Use .map to extract fields before calling .persist or .window") super.persist(newLevel) } protected def getPreferredHosts: ju.Map[TopicQueueId, String] = { locationStrategy match { case PreferConsistent => ju.Collections.emptyMap[TopicQueueId, String]() case PreferFixed(hostMap) => hostMap } } // Keep this consistent with how other streams are named (e.g. "Flume polling stream [2]") private[streaming] override def name: String = s"RocketMq polling stream [$id]" protected[streaming] override val checkpointData = new MQInputDStreamCheckpointData /** * Asynchronously maintains & sends new rate limits to the receiver through the receiver tracker. 
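* Only created when back-pressure (spark.streaming.backpressure.enabled) is on; otherwise
* rate limiting falls back to the static per-partition maximum.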
*/ override protected[streaming] val rateController: Option[RateController] = { if (RateController.isBackPressureEnabled(_ssc.conf)) { Some(new DirectMQRateController(id, RateEstimator.create(_ssc.conf, context.graph.batchDuration))) } else { None } } /** * calculate the until-offset per partition in theory */ private def maxMessagesPerPartition( offsets: Map[TopicQueueId, Map[String, Long]]): Option[Map[TopicQueueId, Map[String, Long]]] = { val estimatedRateLimit = rateController.map(_.getLatestRate().toInt) var lagPerPartition = Map[TopicQueueId, Long]() var totalLag = 0L val lagPerPartitionPerQueue = offsets.map{ case (tp, value) => val partitionTotal = value.map{ case (name, maxOffset) => var count = Math.max(maxOffset - currentOffsets(tp)(name), 0) totalLag += count (name, count) } lagPerPartition += tp -> partitionTotal.values.sum tp -> partitionTotal } val effectiveRateLimitPerPartition = estimatedRateLimit.filter(_ > 0) match { case Some(rate) => lagPerPartitionPerQueue.map { case (tp, queues) => val backPressRate = Math.round(lagPerPartition(tp) / totalLag.toFloat * rate) val partitionMessages = (if (maxRateLimitPerPartition > 0) { Math.min(backPressRate, maxRateLimitPerPartition)} else backPressRate) tp -> queues.map{ case (name, count) => (name, Math.ceil(count / lagPerPartition(tp).toFloat * partitionMessages)) } } case None => lagPerPartitionPerQueue.map { case (tp, queues) => val partitionMessages = maxRateLimitPerPartition tp -> queues.map{ case (name, count) => (name, Math.ceil(count / lagPerPartition(tp).toFloat * partitionMessages)) } } } if (effectiveRateLimitPerPartition.flatMap(_._2).map(_._2).sum > 0) { val secsPerBatch = context.graph.batchDuration.milliseconds.toDouble / 1000 Some(effectiveRateLimitPerPartition.map { case (tp, limit) => tp -> limit.map{ case (name, count) => name -> (count * secsPerBatch).toLong } }) } else { None } } /** * Returns the latest (highest) available offsets, taking new partitions into account. */ protected def latestOffsets(): Map[TopicQueueId, Map[String, Long]] = { val c = consumer val messageQueues = fetchSubscribeMessageQueues(topics) var maxOffsets = Map[TopicQueueId, Map[String, Long]]() val lastTopicQueues = currentOffsets.keySet val fetchTopicQueues = mutable.Set[TopicQueueId]() val iter = messageQueues.iterator while (iter.hasNext) { val messageQueue = iter.next logDebug(s"${messageQueue.toString} min: ${c.minOffset(messageQueue)} max: ${c.maxOffset(messageQueue)}") val topicQueueId = new TopicQueueId(messageQueue.getTopic, messageQueue.getQueueId) fetchTopicQueues.add(topicQueueId) if (!currentOffsets.contains(topicQueueId)){ currentOffsets += topicQueueId -> Map(messageQueue.getBrokerName -> firstConsumerOffset(messageQueue)) }else{ if (!currentOffsets(topicQueueId).contains(messageQueue.getBrokerName)) currentOffsets(topicQueueId) += messageQueue.getBrokerName -> firstConsumerOffset(messageQueue) } if (!maxOffsets.contains(topicQueueId)) { maxOffsets += topicQueueId -> Map(messageQueue.getBrokerName -> c.maxOffset(messageQueue)) }else{ if (!maxOffsets(topicQueueId).contains(messageQueue.getBrokerName)) { val tempMap = maxOffsets(topicQueueId) + (messageQueue.getBrokerName -> c.maxOffset(messageQueue)) maxOffsets += topicQueueId -> tempMap } } } val deletedPartitions = lastTopicQueues.diff(fetchTopicQueues) if (deletedPartitions.size > 0){ reportDataLoss( s"Cannot find offsets of ${deletedPartitions}. 
Some data may have been missed") } maxOffsets } /** * limits the maximum number of messages per partition */ protected def clamp(offsets: Map[TopicQueueId, Map[String, Long]]): Map[TopicQueueId, Map[String, Long]] = { maxMessagesPerPartition(offsets).map { mmp => mmp.map { case (tp, partitionsOffsets) => tp -> partitionsOffsets.map{case (name, messages) => name -> Math.min(currentOffsets(tp)(name) + messages, offsets(tp)(name))} } }.getOrElse(offsets) } override def compute(validTime: Time): Option[RocketMqRDD] = { val untilOffsets = clamp(latestOffsets()) val offsetRangeRdd: ju.Map[TopicQueueId, Array[OffsetRange]] = new ju.HashMap() untilOffsets.foreach { case (tp, uo) => val values = uo.map { case (name, until) => val fo = currentOffsets(tp)(name) OffsetRange(tp.topic, tp.queueId, name, fo, until) }.filter(item => { item.count() > 0 }).toArray if (values != null && values.length > 0) { offsetRangeRdd.put(tp, values) } } val rdd = new RocketMqRDD( context.sparkContext, groupId, optionParams, offsetRangeRdd, getPreferredHosts, true) // Report the record number and metadata of this batch interval to InputInfoTracker. val description = offsetRangeRdd.asScala.flatMap { case (tp, arrayRange) => // Don't display empty ranges. arrayRange }.filter { offsetRange => offsetRange.fromOffset != offsetRange.untilOffset }.map { offsetRange => s"topic: ${offsetRange.topic}\tqueueId: ${offsetRange.queueId}\t" + s"brokerName: ${offsetRange.brokerName}\t" + s"offsets: ${offsetRange.fromOffset} to ${offsetRange.untilOffset}" }.mkString("\n") // Copy offsetRanges to immutable.List to prevent from being modified by the user val metadata = Map( "offsets" -> offsetRangeRdd, StreamInputInfo.METADATA_KEY_DESCRIPTION -> description) val inputInfo = StreamInputInfo(id, rdd.count, metadata) ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo) currentOffsets = collection.mutable.Map() ++ untilOffsets if (autoCommit) { currentOffsets.foreach { case (tp, uo) => uo.map { case (name, until) => val offset = currentOffsets(tp)(name) - 1 val mq = new MessageQueue(tp.topic, name, tp.queueId) kc.updateConsumeOffset(mq, offset) } } } else { commitAll() } Some(rdd) } private def reportDataLoss(message: String): Unit = { if (failOnDataLoss) { throw new IllegalStateException(message) } else { logWarning(message) } } /** * Queue up offset ranges for commit to rocketmq at a future time. Threadsafe. * * @param offsetRanges The maximum untilOffset for a given partition will be used at commit. */ def commitAsync(offsetRanges: ju.Map[TopicQueueId, Array[OffsetRange]]): Unit = { commitAsync(offsetRanges, null) } /** * Queue up offset ranges for commit to rocketmq at a future time. Threadsafe. * * @param offsetRanges The maximum untilOffset for a given partition will be used at commit. * @param callback Only the most recently provided callback will be used at commit. 
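* @note The queued ranges are committed to RocketMQ on the driver by `commitAll()` when the
* next batch is generated (i.e. when autoCommit is disabled).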
*/ def commitAsync(offsetRanges: ju.Map[TopicQueueId, Array[OffsetRange]], callback: OffsetCommitCallback): Unit = { commitCallback.set(callback) offsetRanges.values.asScala.foreach{ value => commitQueue.addAll(ju.Arrays.asList(value: _*)) } } protected def commitAll(): Unit = { val m = new ju.HashMap[MessageQueue, jl.Long] var osr = commitQueue.poll() try { while (null != osr) { //Exclusive ending offset val mq = new MessageQueue(osr.topic, osr.brokerName, osr.queueId) kc.updateConsumeOffset(mq, osr.untilOffset - 1) m.put(mq, osr.untilOffset - 1) osr = commitQueue.poll() } if (commitCallback.get != null) { commitCallback.get.onComplete(m, null) } } catch { case e: Exception => { if (commitCallback.get != null) commitCallback.get.onComplete(m, e) } } } override def start(): Unit = { consumer } override def stop(): Unit = this.synchronized { if (kc != null) { kc.shutdown() } } private[streaming] class MQInputDStreamCheckpointData extends DStreamCheckpointData(this) { def batchForTime: mutable.HashMap[Time, mutable.HashMap[TopicQueueId, Array[(String, Int, String, Long, Long)]]] = { data.asInstanceOf[mutable.HashMap[Time, mutable.HashMap[TopicQueueId, Array[OffsetRange.OffsetRangeTuple]]]] } override def update(time: Time): Unit = { batchForTime.clear() generatedRDDs.foreach { kv => val values = new mutable.HashMap[TopicQueueId, Array[OffsetRange.OffsetRangeTuple]] kv._2.asInstanceOf[RocketMqRDD].offsetRanges.asScala.foreach{ case (k, v) => values.put(k, v.map(_.toTuple)) } batchForTime += kv._1 ->values } } override def cleanup(time: Time): Unit = { } override def restore(): Unit = { batchForTime.toSeq.sortBy(_._1)(Time.ordering).foreach { case (t, b) => logInfo(s"Restoring RocketMqRDD for time $t $b") val offsetRanges = new ju.HashMap[TopicQueueId, Array[OffsetRange]]() b.foreach{ case (i, j) => offsetRanges.put(i, j.map(OffsetRange(_))) } generatedRDDs += t -> new RocketMqRDD( context.sparkContext, groupId, optionParams, offsetRanges, getPreferredHosts, // during restore, it's possible same partition will be consumed from multiple // threads, so dont use cache false ) } } } /** * A RateController to retrieve the rate from RateEstimator. */ private class DirectMQRateController(id: Int, estimator: RateEstimator) extends RateController(id, estimator) { override def publish(rate: Long): Unit = () } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-2.3/org/apache/spark/sql/rocketmq/RocketMQSource.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSource.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ * 2. 
Schema of output dataframe adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import java.io._ import java.nio.charset.StandardCharsets import java.{util => ju} import org.apache.commons.io.IOUtils import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.rocketmq.RocketMQSource._ import org.apache.spark.sql.types.{StructField, _} import org.apache.spark.unsafe.types.UTF8String /** * A [[Source]] that reads data from RocketMQ using the following design. * * - The [[RocketMQSourceOffset]] is the custom [[Offset]] defined for this source that contains * a map of MessageQueue -> offset. Note that this offset is 1 + (available offset). For * example if the last record in a RocketMQ topic "t", partition 2 is offset 5, then * RocketMQSourceOffset will contain MessageQueue("t", 2) -> 6. This is done keep it consistent * with the semantics of `MQPullConsumer.fetchConsumeOffset()`. * * - The [[RocketMQSource]] written to do the following. * * - As soon as the source is created, the pre-configured [[RocketMQOffsetReader]] * is used to query the initial offsets that this source should * start reading from. This is used to create the first batch. * * - `getOffset()` uses the [[RocketMQOffsetReader]] to query the latest * available offsets, which are returned as a [[RocketMQSourceOffset]]. * * - `getBatch()` returns a DF that reads from the 'start offset' until the 'end offset' in * for each partition. The end offset is excluded to be consistent with the semantics of * [[RocketMQSourceOffset]] and `MQPullConsumer.fetchConsumeOffset()`. * * - The DF returned is based on [[RocketMQSourceRDD]] which is constructed such that the * data from RocketMQ topic + partition is consistently read by the same executors across * batches, and cached RocketMQConsumers in the executors can be reused efficiently. See the * docs on [[RocketMQSourceRDD]] for more details. * * Zero data lost is not guaranteed when topics are deleted. If zero data lost is critical, the user * must make sure all messages in a topic have been processed when deleting a topic. */ private class RocketMQSource( sqlContext: SQLContext, offsetReader: RocketMQOffsetReader, executorRocketMQParams: ju.Map[String, String], sourceOptions: Map[String, String], metadataPath: String, startingOffsets: RocketMQOffsetRangeLimit, failOnDataLoss: Boolean) extends Source with Logging { private val sc = sqlContext.sparkContext private val pollTimeoutMs = sourceOptions.getOrElse( RocketMQConf.PULL_TIMEOUT_MS, sc.conf.getTimeAsMs("spark.network.timeout", "120s").toString ).toLong private val maxOffsetsPerTrigger = sourceOptions.get("maxOffsetsPerTrigger").map(_.toLong) /** * Lazily initialize `initialPartitionOffsets` to make sure that `RocketMQConsumer.pull` is only * called in StreamExecutionThread. 
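 *
 * Note: the batch-0 offsets are read back from the [[HDFSMetadataLog]] when present; otherwise
 * they are resolved from `startingOffsets` and persisted. Each log entry is a version line
 * followed by the offset JSON (e.g. "v1" on the first line); entries without a version line are
 * treated as output of Spark 2.1.0.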
*/ private lazy val initialPartitionOffsets = { val metadataLog = new HDFSMetadataLog[RocketMQSourceOffset](sqlContext.sparkSession, metadataPath) { override def serialize(metadata: RocketMQSourceOffset, out: OutputStream): Unit = { out.write(0) // A zero byte is written to support Spark 2.1.0 (SPARK-19517) val writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8)) writer.write("v" + VERSION + "\n") writer.write(metadata.json) writer.flush() } override def deserialize(in: InputStream): RocketMQSourceOffset = { in.read() // A zero byte is read to support Spark 2.1.0 (SPARK-19517) val content = IOUtils.toString(new InputStreamReader(in, StandardCharsets.UTF_8)) // HDFSMetadataLog guarantees that it never creates a partial file. assert(content.length != 0) if (content(0) == 'v') { val indexOfNewLine = content.indexOf("\n") if (indexOfNewLine > 0) { val version = parseVersion(content.substring(0, indexOfNewLine), VERSION) RocketMQSourceOffset(SerializedOffset(content.substring(indexOfNewLine + 1))) } else { throw new IllegalStateException( s"Log file was malformed: failed to detect the log file version line.") } } else { // The log was generated by Spark 2.1.0 RocketMQSourceOffset(SerializedOffset(content)) } } } metadataLog.get(0).getOrElse { val offsets = startingOffsets match { case EarliestOffsetRangeLimit => RocketMQSourceOffset(offsetReader.fetchEarliestOffsets()) case LatestOffsetRangeLimit => RocketMQSourceOffset(offsetReader.fetchLatestOffsets()) case SpecificOffsetRangeLimit(p) => offsetReader.fetchSpecificOffsets(p, reportDataLoss) } metadataLog.add(0, offsets) logInfo(s"Initial offsets: $offsets") offsets }.queueToOffsets } private var currentPartitionOffsets: Option[Map[MessageQueue, Long]] = None override def schema: StructType = RocketMQSource.schema /** Returns the maximum available offset for this source. 
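 *
 * When `maxOffsetsPerTrigger` is set, the result is capped by [[rateLimit]] in proportion to each
 * queue's backlog. Illustrative numbers (not taken from the source): with a limit of 100 and two
 * queues holding 300 and 100 pending messages, roughly 75 and 25 offsets are allotted respectively.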
*/ override def getOffset: Option[Offset] = { // Make sure initialPartitionOffsets is initialized initialPartitionOffsets val latest = offsetReader.fetchLatestOffsets() val offsets = maxOffsetsPerTrigger match { case None => latest case Some(limit) if currentPartitionOffsets.isEmpty => rateLimit(limit, initialPartitionOffsets, latest) case Some(limit) => rateLimit(limit, currentPartitionOffsets.get, latest) } currentPartitionOffsets = Some(offsets) logDebug(s"GetOffset: ${offsets.toSeq.map(_.toString).sorted}") Some(RocketMQSourceOffset(offsets)) } /** Proportionally distribute limit number of offsets among message queues */ private def rateLimit( limit: Long, from: Map[MessageQueue, Long], until: Map[MessageQueue, Long]): Map[MessageQueue, Long] = { val fromNew = offsetReader.fetchEarliestOffsets(until.keySet.diff(from.keySet).toSeq) val sizes = until.flatMap { case (tp, end) => // If begin isn't defined, something's wrong, but let alert logic in getBatch handle it from.get(tp).orElse(fromNew.get(tp)).flatMap { begin => val size = end - begin logDebug(s"rateLimit $tp size is $size") if (size > 0) Some(tp -> size) else None } } val total = sizes.values.sum.toDouble if (total < 1) { until } else { until.map { case (tp, end) => tp -> sizes.get(tp).map { size => val begin = from.get(tp).getOrElse(fromNew(tp)) val prorate = limit * (size / total) logDebug(s"rateLimit $tp prorated amount is $prorate") // Don't completely starve small topicpartitions val off = begin + (if (prorate < 1) Math.ceil(prorate) else Math.floor(prorate)).toLong logDebug(s"rateLimit $tp new offset is $off") // Paranoia, make sure not to return an offset that's past end Math.min(end, off) }.getOrElse(end) } } } /** * Returns the data that is between the offsets * [`start.get.partitionToOffsets`, `end.partitionToOffsets`), i.e. end.partitionToOffsets is * exclusive. */ override def getBatch(start: Option[Offset], end: Offset): DataFrame = { // Make sure initialPartitionOffsets is initialized initialPartitionOffsets logInfo(s"GetBatch called with start = $start, end = $end") val untilPartitionOffsets = RocketMQSourceOffset.getPartitionOffsets(end) // On recovery, getBatch will get called before getOffset if (currentPartitionOffsets.isEmpty) { currentPartitionOffsets = Some(untilPartitionOffsets) } if (start.isDefined && start.get == end) { return sqlContext.internalCreateDataFrame( sqlContext.sparkContext.emptyRDD, schema, isStreaming = true) } val fromPartitionOffsets = start match { case Some(prevBatchEndOffset) => RocketMQSourceOffset.getPartitionOffsets(prevBatchEndOffset) case None => initialPartitionOffsets } // Find the new partitions, and get their earliest offsets val newPartitions = untilPartitionOffsets.keySet.diff(fromPartitionOffsets.keySet) val newPartitionOffsets = offsetReader.fetchEarliestOffsets(newPartitions.toSeq) if (newPartitionOffsets.keySet != newPartitions) { // We cannot get from offsets for some partitions. It means they got deleted. val deletedPartitions = newPartitions.diff(newPartitionOffsets.keySet) reportDataLoss( s"Cannot find earliest offsets of $deletedPartitions. Some data may have been missed") } logInfo(s"Partitions added: $newPartitionOffsets") newPartitionOffsets.filter(_._2 != 0).foreach { case (p, o) => reportDataLoss( s"Added partition $p starts from $o instead of 0. Some data may have been missed") } val deletedPartitions = fromPartitionOffsets.keySet.diff(untilPartitionOffsets.keySet) if (deletedPartitions.nonEmpty) { reportDataLoss(s"$deletedPartitions are gone. 
Some data may have been missed") } // Use the until partitions to calculate offset ranges to ignore partitions that have // been deleted val topicPartitions = untilPartitionOffsets.keySet.filter { tp => // Ignore partitions that we don't know the from offsets. newPartitionOffsets.contains(tp) || fromPartitionOffsets.contains(tp) }.toSeq logDebug("TopicPartitions: " + topicPartitions.mkString(", ")) val sortedExecutors = getSortedExecutorList(sc) val numExecutors = sortedExecutors.length logDebug("Sorted executors: " + sortedExecutors.mkString(", ")) // Calculate offset ranges val offsetRanges = topicPartitions.map { tp => val fromOffset = fromPartitionOffsets.getOrElse(tp, { newPartitionOffsets.getOrElse(tp, { // This should not happen since newPartitionOffsets contains all partitions not in // fromPartitionOffsets throw new IllegalStateException(s"$tp doesn't have a from offset") }) }) val untilOffset = untilPartitionOffsets(tp) val preferredLoc = if (numExecutors > 0) { // This allows cached RocketMQConsumers in the executors to be re-used to read the same // partition in every batch. Some(sortedExecutors(Math.floorMod(tp.hashCode, numExecutors))) } else None RocketMQSourceRDDOffsetRange(tp, fromOffset, untilOffset, preferredLoc) }.filter { range => if (range.untilOffset < range.fromOffset) { reportDataLoss(s"Partition ${range.messageQueue}'s offset was changed from " + s"${range.fromOffset} to ${range.untilOffset}, some data may have been missed") false } else { true } }.toArray // Create an RDD that reads from RocketMQ and get the (key, value) pair as byte arrays. val rdd = new RocketMQSourceRDD( sc, executorRocketMQParams, offsetRanges, pollTimeoutMs, failOnDataLoss, reuseRocketMQConsumer = true).map { cr => // Remove the `brokerName` property which was added by us. See `RocketMQSourceRDD.compute` val brokerName = cr.getProperties.remove(RocketMQSource.PROP_BROKER_NAME) InternalRow( UTF8String.fromString(cr.getTopic), // topic cr.getFlag, // flag cr.getBody, // body UTF8String.fromString(JsonUtils.messageProperties(cr.getProperties)), // properties UTF8String.fromString(brokerName), // brokerName cr.getQueueId, // queueId cr.getQueueOffset, // queueOffset DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.getBornTimestamp)), // bornTimestamp DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.getStoreTimestamp)) // storeTimestamp ) } logInfo("GetBatch generating RDD of offset range: " + offsetRanges.sortBy(_.messageQueue.toString).mkString(", ")) sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) } /** Stop this source and free any resources it has allocated. */ override def stop(): Unit = synchronized { offsetReader.close() } override def toString: String = s"RocketMQSource[$offsetReader]" /** * If `failOnDataLoss` is true, this method will throw an `IllegalStateException`. * Otherwise, just log a warning. */ private def reportDataLoss(message: String): Unit = { if (failOnDataLoss) { throw new IllegalStateException(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE") } else { logWarning(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE") } } } /** Companion object for the [[RocketMQSource]]. */ private object RocketMQSource { val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE = """ |Some data may have been lost because they are not available in RocketMQ any more; either the | data was aged out by RocketMQ or the topic may have been deleted before all the data in the | topic was processed. 
If you want your streaming query to fail on such cases, set the source | option "failOnDataLoss" to "true". """.stripMargin val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE = """ |Some data may have been lost because they are not available in RocketMQ any more; either the | data was aged out by RocketMQ or the topic may have been deleted before all the data in the | topic was processed. If you don't want your streaming query to fail on such cases, set the | source option "failOnDataLoss" to "false". """.stripMargin val VERSION = 1 val PROP_BROKER_NAME = "_brokerName" def getSortedExecutorList(sc: SparkContext): Array[String] = { val bm = sc.env.blockManager bm.master.getPeers(bm.blockManagerId).toArray .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) .sortWith(compare) .map(_.toString) } private def compare(a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = { if (a.host == b.host) { a.executorId > b.executorId } else { a.host > b.host } } def schema: StructType = StructType(Seq( // fields of `Message` StructField("topic", StringType), StructField("flag", IntegerType), StructField("body", BinaryType), StructField("properties", StringType), // fields of `MessageExt` StructField("brokerName", StringType), StructField("queueId", IntegerType), StructField("queueOffset", LongType), StructField("bornTimestamp", TimestampType), StructField("storeTimestamp", TimestampType) )) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-2.3/org/apache/spark/sql/rocketmq/RocketMQSourceOffset.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSourceOffset.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} import org.apache.spark.sql.sources.v2.reader.streaming.{PartitionOffset, Offset => OffsetV2} /** * An [[Offset]] for the [[RocketMQSource]]. This one tracks all partitions of subscribed topics and * their offsets. 
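 *
 * A construction sketch using the tuple-based factory defined in the companion object below
 * (values are illustrative):
 * {{{
 *   val offset = RocketMQSourceOffset(
 *     ("topic_a", "broker-a", 0, 100L),
 *     ("topic_a", "broker-a", 1, 42L))
 * }}}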
*/ private[rocketmq] case class RocketMQSourceOffset(queueToOffsets: Map[MessageQueue, Long]) extends OffsetV2 { override val json = JsonUtils.partitionOffsets(queueToOffsets) } private[rocketmq] case class RocketMQSourcePartitionOffset(messageQueue: MessageQueue, queueOffset: Long) extends PartitionOffset /** Companion object of the [[RocketMQSourceOffset]] */ private[rocketmq] object RocketMQSourceOffset { def getPartitionOffsets(offset: Offset): Map[MessageQueue, Long] = { offset match { case o: RocketMQSourceOffset => o.queueToOffsets case so: SerializedOffset => RocketMQSourceOffset(so).queueToOffsets case _ => throw new IllegalArgumentException( s"Invalid conversion from offset of ${offset.getClass} to RocketMQSourceOffset") } } /** * Returns [[RocketMQSourceOffset]] from a variable sequence of (topic, brokerName, queueId, offset) * tuples. */ def apply(offsetTuples: (String, String, Int, Long)*): RocketMQSourceOffset = { RocketMQSourceOffset(offsetTuples.map { case(t, b, q, o) => (new MessageQueue(t, b, q), o) }.toMap) } /** * Returns [[RocketMQSourceOffset]] from a JSON [[SerializedOffset]] */ def apply(offset: SerializedOffset): RocketMQSourceOffset = RocketMQSourceOffset(JsonUtils.partitionOffsets(offset.json)) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-2.3/org/apache/spark/sql/rocketmq/RocketMQSourceRDDOffsetRange.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSourceRDD.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import java.{util => ju} import org.apache.rocketmq.common.message.{MessageExt, MessageQueue} import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.util.NextIterator import org.apache.spark.{Partition, SparkContext, TaskContext} import scala.collection.mutable.ArrayBuffer /** Offset range that one partition of the RocketMQSourceRDD has to read */ private[rocketmq] case class RocketMQSourceRDDOffsetRange( messageQueue: MessageQueue, fromOffset: Long, untilOffset: Long, preferredLoc: Option[String]) { def size: Long = untilOffset - fromOffset } /** Partition of the RocketMQSourceRDD */ private[rocketmq] case class RocketMQSourceRDDPartition(index: Int, offsetRange: RocketMQSourceRDDOffsetRange) extends Partition /** * An RDD that reads data from RocketMQ based on offset ranges across multiple partitions. 
* Additionally, it allows preferred locations to be set for each topic + partition, so that * the [[RocketMQSource]] can ensure the same executor always reads the same topic + partition * and cached RocketMQConsumers (see [[CachedRocketMQConsumer]] can be used read data efficiently. * * @param sc the [[SparkContext]] * @param executorRocketMQParams RocketMQ configuration for creating RocketMQConsumer on the executors * @param offsetRanges Offset ranges that define the RocketMQ data belonging to this RDD */ private[rocketmq] class RocketMQSourceRDD( sc: SparkContext, executorRocketMQParams: ju.Map[String, String], offsetRanges: Seq[RocketMQSourceRDDOffsetRange], pollTimeoutMs: Long, failOnDataLoss: Boolean, reuseRocketMQConsumer: Boolean) extends RDD[MessageExt](sc, Nil) { override def persist(newLevel: StorageLevel): this.type = { logError("RocketMQ ConsumerRecord is not serializable. " + "Use .map to extract fields before calling .persist or .window") super.persist(newLevel) } override def getPartitions: Array[Partition] = { offsetRanges.zipWithIndex.map { case (o, i) => RocketMQSourceRDDPartition(i, o) }.toArray } override def count(): Long = offsetRanges.map(_.size).sum override def countApprox(timeout: Long, confidence: Double): PartialResult[BoundedDouble] = { val c = count() new PartialResult(new BoundedDouble(c, 1.0, c, c), true) } override def isEmpty(): Boolean = count == 0L override def take(num: Int): Array[MessageExt] = { val nonEmptyPartitions = this.partitions .map(_.asInstanceOf[RocketMQSourceRDDPartition]) .filter(_.offsetRange.size > 0) if (num < 1 || nonEmptyPartitions.isEmpty) { return new Array[MessageExt](0) } // Determine in advance how many messages need to be taken from each partition val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => val remain = num - result.values.sum if (remain > 0) { val taken = Math.min(remain, part.offsetRange.size) result + (part.index -> taken.toInt) } else { result } } val buf = new ArrayBuffer[MessageExt] val res = context.runJob( this, (tc: TaskContext, it: Iterator[MessageExt]) => it.take(parts(tc.partitionId())).toArray, parts.keys.toArray ) res.foreach(buf ++= _) buf.toArray } override def getPreferredLocations(split: Partition): Seq[String] = { val part = split.asInstanceOf[RocketMQSourceRDDPartition] part.offsetRange.preferredLoc.map(Seq(_)).getOrElse(Seq.empty) } override def compute( thePart: Partition, context: TaskContext): Iterator[MessageExt] = { val sourcePartition = thePart.asInstanceOf[RocketMQSourceRDDPartition] val consumer = if (!reuseRocketMQConsumer) { CachedRocketMQConsumer.getOrCreate(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } else { CachedRocketMQConsumer.createUncached(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } val range = resolveRange(consumer, sourcePartition.offsetRange) assert( range.fromOffset <= range.untilOffset, s"Beginning offset ${range.fromOffset} is after the ending offset ${range.untilOffset} for " + s"${range.messageQueue}. You either provided an invalid fromOffset, or the RocketMQ topic has been damaged") if (range.fromOffset == range.untilOffset) { logInfo(s"Beginning offset ${range.fromOffset} is the same as ending offset, " + s"skipping ${range.messageQueue}") Iterator.empty } else { val underlying = new NextIterator[MessageExt]() { private var requestOffset = range.fromOffset override def getNext(): MessageExt = { if (requestOffset >= range.untilOffset) { // Processed all offsets in this partition. 
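// Per Spark's NextIterator contract, signal exhaustion by setting `finished` and returning a dummy value.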
finished = true null } else { val r = consumer.get(requestOffset, range.untilOffset, pollTimeoutMs, failOnDataLoss) if (r == null) { // Losing some data. Skip the rest offsets in this partition. finished = true null } else { requestOffset = r.getQueueOffset + 1 // The MessageExt structure does not contains any field of `brokerName`, so put one into properties r.putUserProperty(RocketMQSource.PROP_BROKER_NAME, sourcePartition.offsetRange.messageQueue.getBrokerName) r } } } override protected def close(): Unit = { if (!reuseRocketMQConsumer) { consumer.close() } else { CachedRocketMQConsumer.releaseConsumer(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } } } // Release consumer, either by removing it or indicating we're no longer using it context.addTaskCompletionListener { _ => underlying.closeIfNeeded() } underlying } } /** * Resolve the EARLIEST/LATEST placeholder in range * @return the range with actual boundary */ private def resolveRange(consumer: CachedRocketMQConsumer, range: RocketMQSourceRDDOffsetRange) = { if (range.fromOffset < 0 || range.untilOffset < 0) { // Late bind the offset range val availableOffsetRange = consumer.getAvailableOffsetRange() val fromOffset = if (range.fromOffset < 0) { assert(range.fromOffset == RocketMQOffsetRangeLimit.EARLIEST, s"earliest offset ${range.fromOffset} does not equal ${RocketMQOffsetRangeLimit.EARLIEST}") availableOffsetRange.earliest } else { range.fromOffset } val untilOffset = if (range.untilOffset < 0) { assert(range.untilOffset == RocketMQOffsetRangeLimit.LATEST, s"latest offset ${range.untilOffset} does not equal ${RocketMQOffsetRangeLimit.LATEST}") availableOffsetRange.latest } else { range.untilOffset } RocketMQSourceRDDOffsetRange(range.messageQueue, fromOffset, untilOffset, range.preferredLoc) } else { range } } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-2.3/org.apache.spark.streaming/RocketMqRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.streaming import org.apache.rocketmq.common.message.MessageExt import org.apache.rocketmq.spark._ import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.storage.StorageLevel import org.apache.spark.{Partition, SparkContext, TaskContext} import java.{util => ju} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer /** * A batch-oriented interface for consuming from RocketMq. * Starting and ending offsets are specified in advance, * so that you can control exactly-once semantics. 
* @param groupId it is for rocketMq for identifying the consumer * @param optionParams the configs * @param offsetRanges offset ranges that define the RocketMq data belonging to this RDD * @param preferredHosts map from TopicQueueId to preferred host for processing that partition. * In most cases, use [[LocationStrategy.PreferConsistent]] * @param useConsumerCache useConsumerCache whether to use a consumer from a per-jvm cache */ class RocketMqRDD ( sc: SparkContext, val groupId: String, val optionParams: ju.Map[String, String], val offsetRanges: ju.Map[TopicQueueId, Array[OffsetRange]], val preferredHosts: ju.Map[TopicQueueId, String], val useConsumerCache: Boolean )extends RDD[MessageExt](sc, Nil) with HasOffsetRanges{ private val cacheInitialCapacity = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_INIT_CAPACITY, "16").toInt private val cacheMaxCapacity = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_MAX_CAPACITY, "64").toInt private val cacheLoadFactor = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_LOAD_FACTOR, "0.75").toFloat override def persist(newLevel: StorageLevel): this.type = { super.persist(newLevel) } override def getPartitions: Array[Partition] = { offsetRanges.asScala.toArray.zipWithIndex.map{ case ((first, second), i) => new RocketMqRDDPartition(i, first.topic, first.queueId, second) }.toArray } override def count(): Long = offsetRanges.asScala.map(_._2.map(_.count).sum).sum override def countApprox( timeout: Long, confidence: Double = 0.95 ): PartialResult[BoundedDouble] = { val c = count new PartialResult(new BoundedDouble(c, 1.0, c, c), true) } override def isEmpty(): Boolean = count == 0L override def take(num: Int): Array[MessageExt] = { val nonEmptyPartitions = this.partitions .map(_.asInstanceOf[RocketMqRDDPartition]) .filter(_.count > 0) if (num < 1 || nonEmptyPartitions.isEmpty) { return new Array[MessageExt](0) } // Determine in advance how many messages need to be taken from each partition val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => val remain = num - result.values.sum if (remain > 0) { val taken = Math.min(remain, part.count) result + (part.index -> taken.toInt) } else { result } } val buf = new ArrayBuffer[MessageExt] val res = context.runJob( this, (tc: TaskContext, it: Iterator[MessageExt]) => it.take(parts(tc.partitionId)).toArray, parts.keys.toArray ) res.foreach(buf ++= _) buf.toArray } private def executors(): Array[ExecutorCacheTaskLocation] = { val bm = sparkContext.env.blockManager bm.master.getPeers(bm.blockManagerId).toArray .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) .sortWith(compareExecutors) } private def compareExecutors( a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = if (a.host == b.host) { a.executorId > b.executorId } else { a.host > b.host } /** * Non-negative modulus, from java 8 math */ private def floorMod(a: Int, b: Int): Int = ((a % b) + b) % b protected override def getPreferredLocations(thePart: Partition): Seq[String] = { // The intention is best-effort consistent executor for a given topic partition, // so that caching consumers can be effective. 
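// If a preferred host is configured for this queue, restrict the candidates to executors on that
// host; otherwise hash the TopicQueueId over the sorted executor list so the choice stays stable
// across batches.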
val part = thePart.asInstanceOf[RocketMqRDDPartition] val allExecs = executors() val tp = part.topicQueueId() val prefHost = preferredHosts.get(tp) val prefExecs = if (null == prefHost) allExecs else allExecs.filter(_.host == prefHost) val execs = if (prefExecs.isEmpty) allExecs else prefExecs if (execs.isEmpty) { Seq() } else { // execs is sorted, tp.hashCode depends only on topic and partition, so consistent index val index = this.floorMod(tp.hashCode, execs.length) val chosen = execs(index) Seq(chosen.toString) } } private def errBeginAfterEnd(part: RocketMqRDDPartition): String = s"Beginning offset is after the ending offset ${part.partitionOffsetRanges.mkString(",")} " + s"for topic ${part.topic} partition ${part.index}. " + "You either provided an invalid fromOffset, or the Kafka topic has been damaged" override def compute(thePart: Partition, context: TaskContext): Iterator[MessageExt] = { val part = thePart.asInstanceOf[RocketMqRDDPartition] val count = part.count() assert(count >= 0, errBeginAfterEnd(part)) if (count == 0) { logInfo(s"Beginning offset is the same as ending offset " + s"skipping ${part.topic} ${part.queueId}") Iterator.empty } else { new RocketMqRDDIterator(part, context) } } /** * An iterator that fetches messages directly from rocketmq for the offsets in partition. * Uses a cached consumer where possible to take advantage of prefetching */ private class RocketMqRDDIterator( part: RocketMqRDDPartition, context: TaskContext) extends Iterator[MessageExt] { logDebug(s"Computing topic ${part.topic}, queueId ${part.queueId} " + s"offsets ${part.partitionOffsetRanges.mkString(",")}") context.addTaskCompletionListener{ context => closeIfNeeded() } val consumer = if (useConsumerCache) { CachedMQConsumer.init(cacheInitialCapacity, cacheMaxCapacity, cacheLoadFactor) if (context.attemptNumber > 5) { // just in case the prior attempt failures were cache related CachedMQConsumer.remove(groupId, part.topic, part.queueId, part.brokerNames) } CachedMQConsumer.getOrCreate(groupId, part.topic, part.queueId, part.brokerNames, optionParams) } else { CachedMQConsumer.getUncached(groupId, part.topic, part.queueId, part.brokerNames, optionParams) } var logicTotalOffset = 0 val totalSum = part.partitionOffsetRanges.map(_.count).sum var index = 0 var requestOffset = part.partitionOffsetRanges.apply(index).fromOffset def closeIfNeeded(): Unit = { if (!useConsumerCache && consumer != null) { consumer.client.shutdown } } override def hasNext(): Boolean = { totalSum > logicTotalOffset } override def next(): MessageExt = { assert(hasNext(), "Can't call getNext() once untilOffset has been reached") val queueRange = part.partitionOffsetRanges.apply(index) val r = consumer.get(queueRange.brokerName, requestOffset) if (queueRange.untilOffset > (requestOffset + 1)) requestOffset +=1 else { index +=1 if (part.partitionOffsetRanges.length > index) requestOffset = part.partitionOffsetRanges.apply(index).fromOffset } logicTotalOffset += 1 r } } private[RocketMqRDD] type OffsetRangeTuple = (String, Int) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-2.4/org/apache/spark/sql/rocketmq/RocketMQSource.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSource.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ * 2. Schema of output dataframe adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import java.io._ import java.nio.charset.StandardCharsets import java.{util => ju} import org.apache.commons.io.IOUtils import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.rocketmq.RocketMQSource._ import org.apache.spark.sql.types.{StructField, _} import org.apache.spark.unsafe.types.UTF8String /** * A [[Source]] that reads data from RocketMQ using the following design. * * - The [[RocketMQSourceOffset]] is the custom [[Offset]] defined for this source that contains * a map of MessageQueue -> offset. Note that this offset is 1 + (available offset). For * example if the last record in a RocketMQ topic "t", partition 2 is offset 5, then * RocketMQSourceOffset will contain MessageQueue("t", 2) -> 6. This is done keep it consistent * with the semantics of `MQPullConsumer.fetchConsumeOffset()`. * * - The [[RocketMQSource]] written to do the following. * * - As soon as the source is created, the pre-configured [[RocketMQOffsetReader]] * is used to query the initial offsets that this source should * start reading from. This is used to create the first batch. * * - `getOffset()` uses the [[RocketMQOffsetReader]] to query the latest * available offsets, which are returned as a [[RocketMQSourceOffset]]. * * - `getBatch()` returns a DF that reads from the 'start offset' until the 'end offset' in * for each partition. The end offset is excluded to be consistent with the semantics of * [[RocketMQSourceOffset]] and `MQPullConsumer.fetchConsumeOffset()`. * * - The DF returned is based on [[RocketMQSourceRDD]] which is constructed such that the * data from RocketMQ topic + partition is consistently read by the same executors across * batches, and cached RocketMQConsumers in the executors can be reused efficiently. See the * docs on [[RocketMQSourceRDD]] for more details. * * Zero data lost is not guaranteed when topics are deleted. If zero data lost is critical, the user * must make sure all messages in a topic have been processed when deleting a topic. 
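 *
 * A downstream usage sketch over the columns defined by [[RocketMQSource.schema]] (the streaming
 * DataFrame `df` is assumed to have been produced by this source):
 * {{{
 *   df.selectExpr("topic", "queueId", "queueOffset", "CAST(body AS STRING) AS body")
 * }}}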
*/ private class RocketMQSource( sqlContext: SQLContext, offsetReader: RocketMQOffsetReader, executorRocketMQParams: ju.Map[String, String], sourceOptions: Map[String, String], metadataPath: String, startingOffsets: RocketMQOffsetRangeLimit, failOnDataLoss: Boolean) extends Source with Logging { private val sc = sqlContext.sparkContext private val pollTimeoutMs = sourceOptions.getOrElse( RocketMQConf.PULL_TIMEOUT_MS, sc.conf.getTimeAsMs("spark.network.timeout", "120s").toString ).toLong private val maxOffsetsPerTrigger = sourceOptions.get("maxOffsetsPerTrigger").map(_.toLong) /** * Lazily initialize `initialPartitionOffsets` to make sure that `RocketMQConsumer.pull` is only * called in StreamExecutionThread. */ private lazy val initialPartitionOffsets = { val metadataLog = new HDFSMetadataLog[RocketMQSourceOffset](sqlContext.sparkSession, metadataPath) { override def serialize(metadata: RocketMQSourceOffset, out: OutputStream): Unit = { out.write(0) // A zero byte is written to support Spark 2.1.0 (SPARK-19517) val writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8)) writer.write("v" + VERSION + "\n") writer.write(metadata.json) writer.flush() } override def deserialize(in: InputStream): RocketMQSourceOffset = { in.read() // A zero byte is read to support Spark 2.1.0 (SPARK-19517) val content = IOUtils.toString(new InputStreamReader(in, StandardCharsets.UTF_8)) // HDFSMetadataLog guarantees that it never creates a partial file. assert(content.length != 0) if (content(0) == 'v') { val indexOfNewLine = content.indexOf("\n") if (indexOfNewLine > 0) { val version = parseVersion(content.substring(0, indexOfNewLine), VERSION) RocketMQSourceOffset(SerializedOffset(content.substring(indexOfNewLine + 1))) } else { throw new IllegalStateException( s"Log file was malformed: failed to detect the log file version line.") } } else { // The log was generated by Spark 2.1.0 RocketMQSourceOffset(SerializedOffset(content)) } } } metadataLog.get(0).getOrElse { val offsets = startingOffsets match { case EarliestOffsetRangeLimit => RocketMQSourceOffset(offsetReader.fetchEarliestOffsets()) case LatestOffsetRangeLimit => RocketMQSourceOffset(offsetReader.fetchLatestOffsets()) case SpecificOffsetRangeLimit(p) => offsetReader.fetchSpecificOffsets(p, reportDataLoss) } metadataLog.add(0, offsets) logInfo(s"Initial offsets: $offsets") offsets }.queueToOffsets } private var currentPartitionOffsets: Option[Map[MessageQueue, Long]] = None override def schema: StructType = RocketMQSource.schema /** Returns the maximum available offset for this source. 
*/ override def getOffset: Option[Offset] = { // Make sure initialPartitionOffsets is initialized initialPartitionOffsets val latest = offsetReader.fetchLatestOffsets() val offsets = maxOffsetsPerTrigger match { case None => latest case Some(limit) if currentPartitionOffsets.isEmpty => rateLimit(limit, initialPartitionOffsets, latest) case Some(limit) => rateLimit(limit, currentPartitionOffsets.get, latest) } currentPartitionOffsets = Some(offsets) logDebug(s"GetOffset: ${offsets.toSeq.map(_.toString).sorted}") Some(RocketMQSourceOffset(offsets)) } /** Proportionally distribute limit number of offsets among message queues */ private def rateLimit( limit: Long, from: Map[MessageQueue, Long], until: Map[MessageQueue, Long]): Map[MessageQueue, Long] = { val fromNew = offsetReader.fetchEarliestOffsets(until.keySet.diff(from.keySet).toSeq) val sizes = until.flatMap { case (tp, end) => // If begin isn't defined, something's wrong, but let alert logic in getBatch handle it from.get(tp).orElse(fromNew.get(tp)).flatMap { begin => val size = end - begin logDebug(s"rateLimit $tp size is $size") if (size > 0) Some(tp -> size) else None } } val total = sizes.values.sum.toDouble if (total < 1) { until } else { until.map { case (tp, end) => tp -> sizes.get(tp).map { size => val begin = from.get(tp).getOrElse(fromNew(tp)) val prorate = limit * (size / total) logDebug(s"rateLimit $tp prorated amount is $prorate") // Don't completely starve small topicpartitions val off = begin + (if (prorate < 1) Math.ceil(prorate) else Math.floor(prorate)).toLong logDebug(s"rateLimit $tp new offset is $off") // Paranoia, make sure not to return an offset that's past end Math.min(end, off) }.getOrElse(end) } } } /** * Returns the data that is between the offsets * [`start.get.partitionToOffsets`, `end.partitionToOffsets`), i.e. end.partitionToOffsets is * exclusive. */ override def getBatch(start: Option[Offset], end: Offset): DataFrame = { // Make sure initialPartitionOffsets is initialized initialPartitionOffsets logInfo(s"GetBatch called with start = $start, end = $end") val untilPartitionOffsets = RocketMQSourceOffset.getPartitionOffsets(end) // On recovery, getBatch will get called before getOffset if (currentPartitionOffsets.isEmpty) { currentPartitionOffsets = Some(untilPartitionOffsets) } if (start.isDefined && start.get == end) { return sqlContext.internalCreateDataFrame( sqlContext.sparkContext.emptyRDD, schema, isStreaming = true) } val fromPartitionOffsets = start match { case Some(prevBatchEndOffset) => RocketMQSourceOffset.getPartitionOffsets(prevBatchEndOffset) case None => initialPartitionOffsets } // Find the new partitions, and get their earliest offsets val newPartitions = untilPartitionOffsets.keySet.diff(fromPartitionOffsets.keySet) val newPartitionOffsets = offsetReader.fetchEarliestOffsets(newPartitions.toSeq) if (newPartitionOffsets.keySet != newPartitions) { // We cannot get from offsets for some partitions. It means they got deleted. val deletedPartitions = newPartitions.diff(newPartitionOffsets.keySet) reportDataLoss( s"Cannot find earliest offsets of $deletedPartitions. Some data may have been missed") } logInfo(s"Partitions added: $newPartitionOffsets") newPartitionOffsets.filter(_._2 != 0).foreach { case (p, o) => reportDataLoss( s"Added partition $p starts from $o instead of 0. Some data may have been missed") } val deletedPartitions = fromPartitionOffsets.keySet.diff(untilPartitionOffsets.keySet) if (deletedPartitions.nonEmpty) { reportDataLoss(s"$deletedPartitions are gone. 
Some data may have been missed") } // Use the until partitions to calculate offset ranges to ignore partitions that have // been deleted val topicPartitions = untilPartitionOffsets.keySet.filter { tp => // Ignore partitions that we don't know the from offsets. newPartitionOffsets.contains(tp) || fromPartitionOffsets.contains(tp) }.toSeq logDebug("TopicPartitions: " + topicPartitions.mkString(", ")) val sortedExecutors = getSortedExecutorList(sc) val numExecutors = sortedExecutors.length logDebug("Sorted executors: " + sortedExecutors.mkString(", ")) // Calculate offset ranges val offsetRanges = topicPartitions.map { tp => val fromOffset = fromPartitionOffsets.getOrElse(tp, { newPartitionOffsets.getOrElse(tp, { // This should not happen since newPartitionOffsets contains all partitions not in // fromPartitionOffsets throw new IllegalStateException(s"$tp doesn't have a from offset") }) }) val untilOffset = untilPartitionOffsets(tp) val preferredLoc = if (numExecutors > 0) { // This allows cached RocketMQConsumers in the executors to be re-used to read the same // partition in every batch. Some(sortedExecutors(Math.floorMod(tp.hashCode, numExecutors))) } else None RocketMQSourceRDDOffsetRange(tp, fromOffset, untilOffset, preferredLoc) }.filter { range => if (range.untilOffset < range.fromOffset) { reportDataLoss(s"Partition ${range.messageQueue}'s offset was changed from " + s"${range.fromOffset} to ${range.untilOffset}, some data may have been missed") false } else { true } }.toArray // Create an RDD that reads from RocketMQ and get the (key, value) pair as byte arrays. val rdd = new RocketMQSourceRDD( sc, executorRocketMQParams, offsetRanges, pollTimeoutMs, failOnDataLoss, reuseRocketMQConsumer = true).map { cr => // Remove the `brokerName` property which was added by us. See `RocketMQSourceRDD.compute` val brokerName = cr.getProperties.remove(RocketMQSource.PROP_BROKER_NAME) InternalRow( UTF8String.fromString(cr.getTopic), // topic cr.getFlag, // flag cr.getBody, // body UTF8String.fromString(JsonUtils.messageProperties(cr.getProperties)), // properties UTF8String.fromString(brokerName), // brokerName cr.getQueueId, // queueId cr.getQueueOffset, // queueOffset DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.getBornTimestamp)), // bornTimestamp DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.getStoreTimestamp)) // storeTimestamp ) } logInfo("GetBatch generating RDD of offset range: " + offsetRanges.sortBy(_.messageQueue.toString).mkString(", ")) sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) } /** Stop this source and free any resources it has allocated. */ override def stop(): Unit = synchronized { offsetReader.close() } override def toString: String = s"RocketMQSource[$offsetReader]" /** * If `failOnDataLoss` is true, this method will throw an `IllegalStateException`. * Otherwise, just log a warning. */ private def reportDataLoss(message: String): Unit = { if (failOnDataLoss) { throw new IllegalStateException(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE") } else { logWarning(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE") } } } /** Companion object for the [[RocketMQSource]]. */ private object RocketMQSource { val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE = """ |Some data may have been lost because they are not available in RocketMQ any more; either the | data was aged out by RocketMQ or the topic may have been deleted before all the data in the | topic was processed. 
If you want your streaming query to fail on such cases, set the source | option "failOnDataLoss" to "true". """.stripMargin val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE = """ |Some data may have been lost because they are not available in RocketMQ any more; either the | data was aged out by RocketMQ or the topic may have been deleted before all the data in the | topic was processed. If you don't want your streaming query to fail on such cases, set the | source option "failOnDataLoss" to "false". """.stripMargin val VERSION = 1 val PROP_BROKER_NAME = "_brokerName" def getSortedExecutorList(sc: SparkContext): Array[String] = { val bm = sc.env.blockManager bm.master.getPeers(bm.blockManagerId).toArray .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) .sortWith(compare) .map(_.toString) } private def compare(a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = { if (a.host == b.host) { a.executorId > b.executorId } else { a.host > b.host } } def schema: StructType = StructType(Seq( // fields of `Message` StructField("topic", StringType), StructField("flag", IntegerType), StructField("body", BinaryType), StructField("properties", StringType), // fields of `MessageExt` StructField("brokerName", StringType), StructField("queueId", IntegerType), StructField("queueOffset", LongType), StructField("bornTimestamp", TimestampType), StructField("storeTimestamp", TimestampType) )) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-2.4/org/apache/spark/sql/rocketmq/RocketMQSourceOffset.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSourceOffset.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} import org.apache.spark.sql.sources.v2.reader.streaming.{PartitionOffset, Offset => OffsetV2} /** * An [[Offset]] for the [[RocketMQSource]]. This one tracks all partitions of subscribed topics and * their offsets. 
*/ private[rocketmq] case class RocketMQSourceOffset(queueToOffsets: Map[MessageQueue, Long]) extends OffsetV2 { override val json = JsonUtils.partitionOffsets(queueToOffsets) } private[rocketmq] case class RocketMQSourcePartitionOffset(messageQueue: MessageQueue, queueOffset: Long) extends PartitionOffset /** Companion object of the [[RocketMQSourceOffset]] */ private[rocketmq] object RocketMQSourceOffset { def getPartitionOffsets(offset: Offset): Map[MessageQueue, Long] = { offset match { case o: RocketMQSourceOffset => o.queueToOffsets case so: SerializedOffset => RocketMQSourceOffset(so).queueToOffsets case _ => throw new IllegalArgumentException( s"Invalid conversion from offset of ${offset.getClass} to RocketMQSourceOffset") } } /** * Returns [[RocketMQSourceOffset]] from a variable sequence of (topic, brokerName, queueId, offset) * tuples. */ def apply(offsetTuples: (String, String, Int, Long)*): RocketMQSourceOffset = { RocketMQSourceOffset(offsetTuples.map { case(t, b, q, o) => (new MessageQueue(t, b, q), o) }.toMap) } /** * Returns [[RocketMQSourceOffset]] from a JSON [[SerializedOffset]] */ def apply(offset: SerializedOffset): RocketMQSourceOffset = RocketMQSourceOffset(JsonUtils.partitionOffsets(offset.json)) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-2.4/org/apache/spark/sql/rocketmq/RocketMQSourceRDDOffsetRange.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSourceRDD.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import java.{util => ju} import org.apache.rocketmq.common.message.{MessageExt, MessageQueue} import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.util.NextIterator import org.apache.spark.{Partition, SparkContext, TaskContext} import scala.collection.mutable.ArrayBuffer /** Offset range that one partition of the RocketMQSourceRDD has to read */ private[rocketmq] case class RocketMQSourceRDDOffsetRange( messageQueue: MessageQueue, fromOffset: Long, untilOffset: Long, preferredLoc: Option[String]) { def size: Long = untilOffset - fromOffset } /** Partition of the RocketMQSourceRDD */ private[rocketmq] case class RocketMQSourceRDDPartition(index: Int, offsetRange: RocketMQSourceRDDOffsetRange) extends Partition /** * An RDD that reads data from RocketMQ based on offset ranges across multiple partitions. 
* Additionally, it allows preferred locations to be set for each topic + partition, so that * the [[RocketMQSource]] can ensure the same executor always reads the same topic + partition * and cached RocketMQConsumers (see [[CachedRocketMQConsumer]] can be used read data efficiently. * * @param sc the [[SparkContext]] * @param executorRocketMQParams RocketMQ configuration for creating RocketMQConsumer on the executors * @param offsetRanges Offset ranges that define the RocketMQ data belonging to this RDD */ private[rocketmq] class RocketMQSourceRDD( sc: SparkContext, executorRocketMQParams: ju.Map[String, String], offsetRanges: Seq[RocketMQSourceRDDOffsetRange], pollTimeoutMs: Long, failOnDataLoss: Boolean, reuseRocketMQConsumer: Boolean) extends RDD[MessageExt](sc, Nil) { override def persist(newLevel: StorageLevel): this.type = { logError("RocketMQ ConsumerRecord is not serializable. " + "Use .map to extract fields before calling .persist or .window") super.persist(newLevel) } override def getPartitions: Array[Partition] = { offsetRanges.zipWithIndex.map { case (o, i) => RocketMQSourceRDDPartition(i, o) }.toArray } override def count(): Long = offsetRanges.map(_.size).sum override def countApprox(timeout: Long, confidence: Double): PartialResult[BoundedDouble] = { val c = count() new PartialResult(new BoundedDouble(c, 1.0, c, c), true) } override def isEmpty(): Boolean = count == 0L override def take(num: Int): Array[MessageExt] = { val nonEmptyPartitions = this.partitions .map(_.asInstanceOf[RocketMQSourceRDDPartition]) .filter(_.offsetRange.size > 0) if (num < 1 || nonEmptyPartitions.isEmpty) { return new Array[MessageExt](0) } // Determine in advance how many messages need to be taken from each partition val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => val remain = num - result.values.sum if (remain > 0) { val taken = Math.min(remain, part.offsetRange.size) result + (part.index -> taken.toInt) } else { result } } val buf = new ArrayBuffer[MessageExt] val res = context.runJob( this, (tc: TaskContext, it: Iterator[MessageExt]) => it.take(parts(tc.partitionId())).toArray, parts.keys.toArray ) res.foreach(buf ++= _) buf.toArray } override def getPreferredLocations(split: Partition): Seq[String] = { val part = split.asInstanceOf[RocketMQSourceRDDPartition] part.offsetRange.preferredLoc.map(Seq(_)).getOrElse(Seq.empty) } override def compute( thePart: Partition, context: TaskContext): Iterator[MessageExt] = { val sourcePartition = thePart.asInstanceOf[RocketMQSourceRDDPartition] val consumer = if (!reuseRocketMQConsumer) { CachedRocketMQConsumer.getOrCreate(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } else { CachedRocketMQConsumer.createUncached(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } val range = resolveRange(consumer, sourcePartition.offsetRange) assert( range.fromOffset <= range.untilOffset, s"Beginning offset ${range.fromOffset} is after the ending offset ${range.untilOffset} for " + s"${range.messageQueue}. You either provided an invalid fromOffset, or the RocketMQ topic has been damaged") if (range.fromOffset == range.untilOffset) { logInfo(s"Beginning offset ${range.fromOffset} is the same as ending offset, " + s"skipping ${range.messageQueue}") Iterator.empty } else { val underlying = new NextIterator[MessageExt]() { private var requestOffset = range.fromOffset override def getNext(): MessageExt = { if (requestOffset >= range.untilOffset) { // Processed all offsets in this partition. 
finished = true null } else { val r = consumer.get(requestOffset, range.untilOffset, pollTimeoutMs, failOnDataLoss) if (r == null) { // Losing some data. Skip the rest offsets in this partition. finished = true null } else { requestOffset = r.getQueueOffset + 1 // The MessageExt structure does not contains any field of `brokerName`, so put one into properties r.putUserProperty(RocketMQSource.PROP_BROKER_NAME, sourcePartition.offsetRange.messageQueue.getBrokerName) r } } } override protected def close(): Unit = { if (!reuseRocketMQConsumer) { consumer.close() } else { CachedRocketMQConsumer.releaseConsumer(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } } } // Release consumer, either by removing it or indicating we're no longer using it context.addTaskCompletionListener { _ => underlying.closeIfNeeded() } underlying } } /** * Resolve the EARLIEST/LATEST placeholder in range * @return the range with actual boundary */ private def resolveRange(consumer: CachedRocketMQConsumer, range: RocketMQSourceRDDOffsetRange) = { if (range.fromOffset < 0 || range.untilOffset < 0) { // Late bind the offset range val availableOffsetRange = consumer.getAvailableOffsetRange() val fromOffset = if (range.fromOffset < 0) { assert(range.fromOffset == RocketMQOffsetRangeLimit.EARLIEST, s"earliest offset ${range.fromOffset} does not equal ${RocketMQOffsetRangeLimit.EARLIEST}") availableOffsetRange.earliest } else { range.fromOffset } val untilOffset = if (range.untilOffset < 0) { assert(range.untilOffset == RocketMQOffsetRangeLimit.LATEST, s"latest offset ${range.untilOffset} does not equal ${RocketMQOffsetRangeLimit.LATEST}") availableOffsetRange.latest } else { range.untilOffset } RocketMQSourceRDDOffsetRange(range.messageQueue, fromOffset, untilOffset, range.preferredLoc) } else { range } } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-2.4/org.apache.spark.streaming/RocketMqRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.streaming import org.apache.rocketmq.common.message.MessageExt import org.apache.rocketmq.spark._ import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.storage.StorageLevel import org.apache.spark.{Partition, SparkContext, TaskContext} import java.{util => ju} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer /** * A batch-oriented interface for consuming from RocketMq. * Starting and ending offsets are specified in advance, * so that you can control exactly-once semantics. 
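 *
 * A construction sketch (hedged: `new TopicQueueId(topic, queueId)`, the `params` map and the
 * five-argument `OffsetRange(topic, queueId, brokerName, fromOffset, untilOffset)` factory are
 * assumed from their usage elsewhere in this module; values are illustrative):
 * {{{
 *   val ranges = new ju.HashMap[TopicQueueId, Array[OffsetRange]]()
 *   ranges.put(new TopicQueueId("topic_a", 0),
 *     Array(OffsetRange("topic_a", 0, "broker-a", 0L, 100L)))
 *   val rdd = new RocketMqRDD(sc, "consumer_group_a", params, ranges,
 *     new ju.HashMap[TopicQueueId, String](), useConsumerCache = false)
 * }}}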
* @param groupId it is for rocketMq for identifying the consumer * @param optionParams the configs * @param offsetRanges offset ranges that define the RocketMq data belonging to this RDD * @param preferredHosts map from TopicQueueId to preferred host for processing that partition. * In most cases, use [[LocationStrategy.PreferConsistent]] * @param useConsumerCache useConsumerCache whether to use a consumer from a per-jvm cache */ class RocketMqRDD ( sc: SparkContext, val groupId: String, val optionParams: ju.Map[String, String], val offsetRanges: ju.Map[TopicQueueId, Array[OffsetRange]], val preferredHosts: ju.Map[TopicQueueId, String], val useConsumerCache: Boolean )extends RDD[MessageExt](sc, Nil) with HasOffsetRanges{ private val cacheInitialCapacity = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_INIT_CAPACITY, "16").toInt private val cacheMaxCapacity = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_MAX_CAPACITY, "64").toInt private val cacheLoadFactor = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_LOAD_FACTOR, "0.75").toFloat override def persist(newLevel: StorageLevel): this.type = { super.persist(newLevel) } override def getPartitions: Array[Partition] = { offsetRanges.asScala.toArray.zipWithIndex.map{ case ((first, second), i) => new RocketMqRDDPartition(i, first.topic, first.queueId, second) }.toArray } override def count(): Long = offsetRanges.asScala.map(_._2.map(_.count).sum).sum override def countApprox( timeout: Long, confidence: Double = 0.95 ): PartialResult[BoundedDouble] = { val c = count new PartialResult(new BoundedDouble(c, 1.0, c, c), true) } override def isEmpty(): Boolean = count == 0L override def take(num: Int): Array[MessageExt] = { val nonEmptyPartitions = this.partitions .map(_.asInstanceOf[RocketMqRDDPartition]) .filter(_.count > 0) if (num < 1 || nonEmptyPartitions.isEmpty) { return new Array[MessageExt](0) } // Determine in advance how many messages need to be taken from each partition val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => val remain = num - result.values.sum if (remain > 0) { val taken = Math.min(remain, part.count) result + (part.index -> taken.toInt) } else { result } } val buf = new ArrayBuffer[MessageExt] val res = context.runJob( this, (tc: TaskContext, it: Iterator[MessageExt]) => it.take(parts(tc.partitionId)).toArray, parts.keys.toArray ) res.foreach(buf ++= _) buf.toArray } private def executors(): Array[ExecutorCacheTaskLocation] = { val bm = sparkContext.env.blockManager bm.master.getPeers(bm.blockManagerId).toArray .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) .sortWith(compareExecutors) } private def compareExecutors( a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = if (a.host == b.host) { a.executorId > b.executorId } else { a.host > b.host } /** * Non-negative modulus, from java 8 math */ private def floorMod(a: Int, b: Int): Int = ((a % b) + b) % b protected override def getPreferredLocations(thePart: Partition): Seq[String] = { // The intention is best-effort consistent executor for a given topic partition, // so that caching consumers can be effective. 
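// For example, with 3 sorted executors a queue whose hashCode is -7 is assigned index floorMod(-7, 3) = 2 on every batch, so the same cached consumer is reused.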
val part = thePart.asInstanceOf[RocketMqRDDPartition] val allExecs = executors() val tp = part.topicQueueId() val prefHost = preferredHosts.get(tp) val prefExecs = if (null == prefHost) allExecs else allExecs.filter(_.host == prefHost) val execs = if (prefExecs.isEmpty) allExecs else prefExecs if (execs.isEmpty) { Seq() } else { // execs is sorted, tp.hashCode depends only on topic and partition, so consistent index val index = this.floorMod(tp.hashCode, execs.length) val chosen = execs(index) Seq(chosen.toString) } } private def errBeginAfterEnd(part: RocketMqRDDPartition): String = s"Beginning offset is after the ending offset ${part.partitionOffsetRanges.mkString(",")} " + s"for topic ${part.topic} partition ${part.index}. " + "You either provided an invalid fromOffset, or the Kafka topic has been damaged" override def compute(thePart: Partition, context: TaskContext): Iterator[MessageExt] = { val part = thePart.asInstanceOf[RocketMqRDDPartition] val count = part.count() assert(count >= 0, errBeginAfterEnd(part)) if (count == 0) { logInfo(s"Beginning offset is the same as ending offset " + s"skipping ${part.topic} ${part.queueId}") Iterator.empty } else { new RocketMqRDDIterator(part, context) } } /** * An iterator that fetches messages directly from rocketmq for the offsets in partition. * Uses a cached consumer where possible to take advantage of prefetching */ private class RocketMqRDDIterator( part: RocketMqRDDPartition, context: TaskContext) extends Iterator[MessageExt] { logDebug(s"Computing topic ${part.topic}, queueId ${part.queueId} " + s"offsets ${part.partitionOffsetRanges.mkString(",")}") context.addTaskCompletionListener{ context => closeIfNeeded() } val consumer = if (useConsumerCache) { CachedMQConsumer.init(cacheInitialCapacity, cacheMaxCapacity, cacheLoadFactor) if (context.attemptNumber > 5) { // just in case the prior attempt failures were cache related CachedMQConsumer.remove(groupId, part.topic, part.queueId, part.brokerNames) } CachedMQConsumer.getOrCreate(groupId, part.topic, part.queueId, part.brokerNames, optionParams) } else { CachedMQConsumer.getUncached(groupId, part.topic, part.queueId, part.brokerNames, optionParams) } var logicTotalOffset = 0 val totalSum = part.partitionOffsetRanges.map(_.count).sum var index = 0 var requestOffset = part.partitionOffsetRanges.apply(index).fromOffset def closeIfNeeded(): Unit = { if (!useConsumerCache && consumer != null) { consumer.client.shutdown } } override def hasNext(): Boolean = { totalSum > logicTotalOffset } override def next(): MessageExt = { assert(hasNext(), "Can't call getNext() once untilOffset has been reached") val queueRange = part.partitionOffsetRanges.apply(index) val r = consumer.get(queueRange.brokerName, requestOffset) if (queueRange.untilOffset > (requestOffset + 1)) requestOffset +=1 else { index +=1 if (part.partitionOffsetRanges.length > index) requestOffset = part.partitionOffsetRanges.apply(index).fromOffset } logicTotalOffset += 1 r } } private[RocketMqRDD] type OffsetRangeTuple = (String, Int) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.0/org/apache/spark/sql/rocketmq/RocketMQSource.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSource.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ * 2. Schema of output dataframe adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import java.io._ import java.nio.charset.StandardCharsets import java.{util => ju} import org.apache.commons.io.IOUtils import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.rocketmq.RocketMQSource._ import org.apache.spark.sql.types.{StructField, _} import org.apache.spark.unsafe.types.UTF8String /** * A [[Source]] that reads data from RocketMQ using the following design. * * - The [[RocketMQSourceOffset]] is the custom [[Offset]] defined for this source that contains * a map of MessageQueue -> offset. Note that this offset is 1 + (available offset). For * example if the last record in a RocketMQ topic "t", partition 2 is offset 5, then * RocketMQSourceOffset will contain MessageQueue("t", 2) -> 6. This is done keep it consistent * with the semantics of `MQPullConsumer.fetchConsumeOffset()`. * * - The [[RocketMQSource]] written to do the following. * * - As soon as the source is created, the pre-configured [[RocketMQOffsetReader]] * is used to query the initial offsets that this source should * start reading from. This is used to create the first batch. * * - `getOffset()` uses the [[RocketMQOffsetReader]] to query the latest * available offsets, which are returned as a [[RocketMQSourceOffset]]. * * - `getBatch()` returns a DF that reads from the 'start offset' until the 'end offset' in * for each partition. The end offset is excluded to be consistent with the semantics of * [[RocketMQSourceOffset]] and `MQPullConsumer.fetchConsumeOffset()`. * * - The DF returned is based on [[RocketMQSourceRDD]] which is constructed such that the * data from RocketMQ topic + partition is consistently read by the same executors across * batches, and cached RocketMQConsumers in the executors can be reused efficiently. See the * docs on [[RocketMQSourceRDD]] for more details. * * Zero data lost is not guaranteed when topics are deleted. If zero data lost is critical, the user * must make sure all messages in a topic have been processed when deleting a topic. 
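*
* A minimal read sketch (hedged: the short format name "rocketmq" and the SparkSession value
* `spark` are assumptions for illustration; only the "failOnDataLoss" and "maxOffsetsPerTrigger"
* option keys are taken from this source):
* {{{
*   val df = spark.readStream
*     .format("rocketmq")                      // hypothetical short name registered by the provider
*     .option("failOnDataLoss", "false")
*     .option("maxOffsetsPerTrigger", "10000")
*     .load()
*   df.selectExpr("topic", "queueId", "CAST(body AS STRING)")
* }}}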
*/ private class RocketMQSource( sqlContext: SQLContext, offsetReader: RocketMQOffsetReader, executorRocketMQParams: ju.Map[String, String], sourceOptions: Map[String, String], metadataPath: String, startingOffsets: RocketMQOffsetRangeLimit, failOnDataLoss: Boolean) extends Source with Logging { private val sc = sqlContext.sparkContext private val pollTimeoutMs = sourceOptions.getOrElse( RocketMQConf.PULL_TIMEOUT_MS, sc.conf.getTimeAsMs("spark.network.timeout", "120s").toString ).toLong private val maxOffsetsPerTrigger = sourceOptions.get("maxOffsetsPerTrigger").map(_.toLong) /** * Lazily initialize `initialPartitionOffsets` to make sure that `RocketMQConsumer.pull` is only * called in StreamExecutionThread. */ private lazy val initialPartitionOffsets = { val metadataLog = new HDFSMetadataLog[RocketMQSourceOffset](sqlContext.sparkSession, metadataPath) { override def serialize(metadata: RocketMQSourceOffset, out: OutputStream): Unit = { out.write(0) // A zero byte is written to support Spark 2.1.0 (SPARK-19517) val writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8)) writer.write("v" + VERSION + "\n") writer.write(metadata.json) writer.flush() } override def deserialize(in: InputStream): RocketMQSourceOffset = { in.read() // A zero byte is read to support Spark 2.1.0 (SPARK-19517) val content = IOUtils.toString(new InputStreamReader(in, StandardCharsets.UTF_8)) // HDFSMetadataLog guarantees that it never creates a partial file. assert(content.length != 0) if (content(0) == 'v') { val indexOfNewLine = content.indexOf("\n") if (indexOfNewLine > 0) { val version = validateVersion(content.substring(0, indexOfNewLine), VERSION) RocketMQSourceOffset(SerializedOffset(content.substring(indexOfNewLine + 1))) } else { throw new IllegalStateException( s"Log file was malformed: failed to detect the log file version line.") } } else { // The log was generated by Spark 2.1.0 RocketMQSourceOffset(SerializedOffset(content)) } } } metadataLog.get(0).getOrElse { val offsets = startingOffsets match { case EarliestOffsetRangeLimit => RocketMQSourceOffset(offsetReader.fetchEarliestOffsets()) case LatestOffsetRangeLimit => RocketMQSourceOffset(offsetReader.fetchLatestOffsets()) case SpecificOffsetRangeLimit(p) => offsetReader.fetchSpecificOffsets(p, reportDataLoss) } metadataLog.add(0, offsets) logInfo(s"Initial offsets: $offsets") offsets }.queueToOffsets } private var currentPartitionOffsets: Option[Map[MessageQueue, Long]] = None override def schema: StructType = RocketMQSource.schema /** Returns the maximum available offset for this source. 
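* When `maxOffsetsPerTrigger` is set, the returned per-queue offsets are capped via [[rateLimit]] so a single micro-batch does not consume the entire backlog.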
*/ override def getOffset: Option[Offset] = { // Make sure initialPartitionOffsets is initialized initialPartitionOffsets val latest = offsetReader.fetchLatestOffsets() val offsets = maxOffsetsPerTrigger match { case None => latest case Some(limit) if currentPartitionOffsets.isEmpty => rateLimit(limit, initialPartitionOffsets, latest) case Some(limit) => rateLimit(limit, currentPartitionOffsets.get, latest) } currentPartitionOffsets = Some(offsets) logDebug(s"GetOffset: ${offsets.toSeq.map(_.toString).sorted}") Some(RocketMQSourceOffset(offsets)) } /** Proportionally distribute limit number of offsets among message queues */ private def rateLimit( limit: Long, from: Map[MessageQueue, Long], until: Map[MessageQueue, Long]): Map[MessageQueue, Long] = { val fromNew = offsetReader.fetchEarliestOffsets(until.keySet.diff(from.keySet).toSeq) val sizes = until.flatMap { case (tp, end) => // If begin isn't defined, something's wrong, but let alert logic in getBatch handle it from.get(tp).orElse(fromNew.get(tp)).flatMap { begin => val size = end - begin logDebug(s"rateLimit $tp size is $size") if (size > 0) Some(tp -> size) else None } } val total = sizes.values.sum.toDouble if (total < 1) { until } else { until.map { case (tp, end) => tp -> sizes.get(tp).map { size => val begin = from.get(tp).getOrElse(fromNew(tp)) val prorate = limit * (size / total) logDebug(s"rateLimit $tp prorated amount is $prorate") // Don't completely starve small topicpartitions val off = begin + (if (prorate < 1) Math.ceil(prorate) else Math.floor(prorate)).toLong logDebug(s"rateLimit $tp new offset is $off") // Paranoia, make sure not to return an offset that's past end Math.min(end, off) }.getOrElse(end) } } } /** * Returns the data that is between the offsets * [`start.get.partitionToOffsets`, `end.partitionToOffsets`), i.e. end.partitionToOffsets is * exclusive. */ override def getBatch(start: Option[Offset], end: Offset): DataFrame = { // Make sure initialPartitionOffsets is initialized initialPartitionOffsets logInfo(s"GetBatch called with start = $start, end = $end") val untilPartitionOffsets = RocketMQSourceOffset.getPartitionOffsets(end) // On recovery, getBatch will get called before getOffset if (currentPartitionOffsets.isEmpty) { currentPartitionOffsets = Some(untilPartitionOffsets) } if (start.isDefined && start.get == end) { return sqlContext.internalCreateDataFrame( sqlContext.sparkContext.emptyRDD, schema, isStreaming = true) } val fromPartitionOffsets = start match { case Some(prevBatchEndOffset) => RocketMQSourceOffset.getPartitionOffsets(prevBatchEndOffset) case None => initialPartitionOffsets } // Find the new partitions, and get their earliest offsets val newPartitions = untilPartitionOffsets.keySet.diff(fromPartitionOffsets.keySet) val newPartitionOffsets = offsetReader.fetchEarliestOffsets(newPartitions.toSeq) if (newPartitionOffsets.keySet != newPartitions) { // We cannot get from offsets for some partitions. It means they got deleted. val deletedPartitions = newPartitions.diff(newPartitionOffsets.keySet) reportDataLoss( s"Cannot find earliest offsets of $deletedPartitions. Some data may have been missed") } logInfo(s"Partitions added: $newPartitionOffsets") newPartitionOffsets.filter(_._2 != 0).foreach { case (p, o) => reportDataLoss( s"Added partition $p starts from $o instead of 0. Some data may have been missed") } val deletedPartitions = fromPartitionOffsets.keySet.diff(untilPartitionOffsets.keySet) if (deletedPartitions.nonEmpty) { reportDataLoss(s"$deletedPartitions are gone. 
Some data may have been missed") } // Use the until partitions to calculate offset ranges to ignore partitions that have // been deleted val topicPartitions = untilPartitionOffsets.keySet.filter { tp => // Ignore partitions that we don't know the from offsets. newPartitionOffsets.contains(tp) || fromPartitionOffsets.contains(tp) }.toSeq logDebug("TopicPartitions: " + topicPartitions.mkString(", ")) val sortedExecutors = getSortedExecutorList(sc) val numExecutors = sortedExecutors.length logDebug("Sorted executors: " + sortedExecutors.mkString(", ")) // Calculate offset ranges val offsetRanges = topicPartitions.map { tp => val fromOffset = fromPartitionOffsets.getOrElse(tp, { newPartitionOffsets.getOrElse(tp, { // This should not happen since newPartitionOffsets contains all partitions not in // fromPartitionOffsets throw new IllegalStateException(s"$tp doesn't have a from offset") }) }) val untilOffset = untilPartitionOffsets(tp) val preferredLoc = if (numExecutors > 0) { // This allows cached RocketMQConsumers in the executors to be re-used to read the same // partition in every batch. Some(sortedExecutors(Math.floorMod(tp.hashCode, numExecutors))) } else None RocketMQSourceRDDOffsetRange(tp, fromOffset, untilOffset, preferredLoc) }.filter { range => if (range.untilOffset < range.fromOffset) { reportDataLoss(s"Partition ${range.messageQueue}'s offset was changed from " + s"${range.fromOffset} to ${range.untilOffset}, some data may have been missed") false } else { true } }.toArray // Create an RDD that reads from RocketMQ and get the (key, value) pair as byte arrays. val rdd = new RocketMQSourceRDD( sc, executorRocketMQParams, offsetRanges, pollTimeoutMs, failOnDataLoss, reuseRocketMQConsumer = true).map { cr => // Remove the `brokerName` property which was added by us. See `RocketMQSourceRDD.compute` val brokerName = cr.getProperties.remove(RocketMQSource.PROP_BROKER_NAME) InternalRow( UTF8String.fromString(cr.getTopic), // topic cr.getFlag, // flag cr.getBody, // body UTF8String.fromString(JsonUtils.messageProperties(cr.getProperties)), // properties UTF8String.fromString(brokerName), // brokerName cr.getQueueId, // queueId cr.getQueueOffset, // queueOffset DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.getBornTimestamp)), // bornTimestamp DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.getStoreTimestamp)) // storeTimestamp ) } logInfo("GetBatch generating RDD of offset range: " + offsetRanges.sortBy(_.messageQueue.toString).mkString(", ")) sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) } /** Stop this source and free any resources it has allocated. */ override def stop(): Unit = synchronized { offsetReader.close() } override def toString: String = s"RocketMQSource[$offsetReader]" /** * If `failOnDataLoss` is true, this method will throw an `IllegalStateException`. * Otherwise, just log a warning. */ private def reportDataLoss(message: String): Unit = { if (failOnDataLoss) { throw new IllegalStateException(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE") } else { logWarning(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE") } } } /** Companion object for the [[RocketMQSource]]. */ private object RocketMQSource { val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE = """ |Some data may have been lost because they are not available in RocketMQ any more; either the | data was aged out by RocketMQ or the topic may have been deleted before all the data in the | topic was processed. 
If you want your streaming query to fail on such cases, set the source | option "failOnDataLoss" to "true". """.stripMargin val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE = """ |Some data may have been lost because they are not available in RocketMQ any more; either the | data was aged out by RocketMQ or the topic may have been deleted before all the data in the | topic was processed. If you don't want your streaming query to fail on such cases, set the | source option "failOnDataLoss" to "false". """.stripMargin val VERSION = 1 val PROP_BROKER_NAME = "_brokerName" def getSortedExecutorList(sc: SparkContext): Array[String] = { val bm = sc.env.blockManager bm.master.getPeers(bm.blockManagerId).toArray .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) .sortWith(compare) .map(_.toString) } private def compare(a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = { if (a.host == b.host) { a.executorId > b.executorId } else { a.host > b.host } } def schema: StructType = StructType(Seq( // fields of `Message` StructField("topic", StringType), StructField("flag", IntegerType), StructField("body", BinaryType), StructField("properties", StringType), // fields of `MessageExt` StructField("brokerName", StringType), StructField("queueId", IntegerType), StructField("queueOffset", LongType), StructField("bornTimestamp", TimestampType), StructField("storeTimestamp", TimestampType) )) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.0/org/apache/spark/sql/rocketmq/RocketMQSourceOffset.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSourceOffset.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.sql.connector.read.streaming.PartitionOffset import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} /** * An [[Offset]] for the [[RocketMQSource]]. This one tracks all partitions of subscribed topics and * their offsets. 
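*
* For example (illustrative values only), an offset covering two queues of topic "t" on broker "broker-a" can be built from (topic, brokerName, queueId, offset) tuples via the companion apply:
* {{{
*   RocketMQSourceOffset(("t", "broker-a", 0, 100L), ("t", "broker-a", 1, 200L))
* }}}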
*/ private[rocketmq] case class RocketMQSourceOffset(queueToOffsets: Map[MessageQueue, Long]) extends Offset { override val json = JsonUtils.partitionOffsets(queueToOffsets) } private[rocketmq] case class RocketMQSourcePartitionOffset(messageQueue: MessageQueue, queueOffset: Long) extends PartitionOffset /** Companion object of the [[RocketMQSourceOffset]] */ private[rocketmq] object RocketMQSourceOffset { def getPartitionOffsets(offset: Offset): Map[MessageQueue, Long] = { offset match { case o: RocketMQSourceOffset => o.queueToOffsets case so: SerializedOffset => RocketMQSourceOffset(so).queueToOffsets case _ => throw new IllegalArgumentException( s"Invalid conversion from offset of ${offset.getClass} to RocketMQSourceOffset") } } /** * Returns [[RocketMQSourceOffset]] from a variable sequence of (topic, brokerName, queueId, offset) * tuples. */ def apply(offsetTuples: (String, String, Int, Long)*): RocketMQSourceOffset = { RocketMQSourceOffset(offsetTuples.map { case(t, b, q, o) => (new MessageQueue(t, b, q), o) }.toMap) } /** * Returns [[RocketMQSourceOffset]] from a JSON [[SerializedOffset]] */ def apply(offset: SerializedOffset): RocketMQSourceOffset = RocketMQSourceOffset(JsonUtils.partitionOffsets(offset.json)) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.0/org/apache/spark/sql/rocketmq/RocketMQSourceRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSourceRDD.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import org.apache.rocketmq.common.message.{MessageExt, MessageQueue} import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.util.NextIterator import org.apache.spark.{Partition, SparkContext, TaskContext} import java.{util => ju} import scala.collection.mutable.ArrayBuffer /** Offset range that one partition of the RocketMQSourceRDD has to read */ private[rocketmq] case class RocketMQSourceRDDOffsetRange( messageQueue: MessageQueue, fromOffset: Long, untilOffset: Long, preferredLoc: Option[String]) { def size: Long = untilOffset - fromOffset } /** Partition of the RocketMQSourceRDD */ private[rocketmq] case class RocketMQSourceRDDPartition(index: Int, offsetRange: RocketMQSourceRDDOffsetRange) extends Partition /** * An RDD that reads data from RocketMQ based on offset ranges across multiple partitions. 
* Additionally, it allows preferred locations to be set for each topic + partition, so that * the [[RocketMQSource]] can ensure the same executor always reads the same topic + partition * and cached RocketMQConsumers (see [[CachedRocketMQConsumer]] can be used read data efficiently. * * @param sc the [[SparkContext]] * @param executorRocketMQParams RocketMQ configuration for creating RocketMQConsumer on the executors * @param offsetRanges Offset ranges that define the RocketMQ data belonging to this RDD */ private[rocketmq] class RocketMQSourceRDD( sc: SparkContext, executorRocketMQParams: ju.Map[String, String], offsetRanges: Seq[RocketMQSourceRDDOffsetRange], pollTimeoutMs: Long, failOnDataLoss: Boolean, reuseRocketMQConsumer: Boolean) extends RDD[MessageExt](sc, Nil) { override def persist(newLevel: StorageLevel): this.type = { logError("RocketMQ ConsumerRecord is not serializable. " + "Use .map to extract fields before calling .persist or .window") super.persist(newLevel) } override def getPartitions: Array[Partition] = { offsetRanges.zipWithIndex.map { case (o, i) => RocketMQSourceRDDPartition(i, o) }.toArray } override def count(): Long = offsetRanges.map(_.size).sum override def countApprox(timeout: Long, confidence: Double): PartialResult[BoundedDouble] = { val c = count() new PartialResult(new BoundedDouble(c, 1.0, c, c), true) } override def isEmpty(): Boolean = count == 0L override def take(num: Int): Array[MessageExt] = { val nonEmptyPartitions = this.partitions .map(_.asInstanceOf[RocketMQSourceRDDPartition]) .filter(_.offsetRange.size > 0) if (num < 1 || nonEmptyPartitions.isEmpty) { return new Array[MessageExt](0) } // Determine in advance how many messages need to be taken from each partition val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => val remain = num - result.values.sum if (remain > 0) { val taken = Math.min(remain, part.offsetRange.size) result + (part.index -> taken.toInt) } else { result } } val buf = new ArrayBuffer[MessageExt] val res = context.runJob( this, (tc: TaskContext, it: Iterator[MessageExt]) => it.take(parts(tc.partitionId())).toArray, parts.keys.toArray ) res.foreach(buf ++= _) buf.toArray } override def getPreferredLocations(split: Partition): Seq[String] = { val part = split.asInstanceOf[RocketMQSourceRDDPartition] part.offsetRange.preferredLoc.map(Seq(_)).getOrElse(Seq.empty) } override def compute( thePart: Partition, context: TaskContext): Iterator[MessageExt] = { val sourcePartition = thePart.asInstanceOf[RocketMQSourceRDDPartition] val consumer = if (!reuseRocketMQConsumer) { CachedRocketMQConsumer.getOrCreate(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } else { CachedRocketMQConsumer.createUncached(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } val range = resolveRange(consumer, sourcePartition.offsetRange) assert( range.fromOffset <= range.untilOffset, s"Beginning offset ${range.fromOffset} is after the ending offset ${range.untilOffset} for " + s"${range.messageQueue}. You either provided an invalid fromOffset, or the RocketMQ topic has been damaged") if (range.fromOffset == range.untilOffset) { logInfo(s"Beginning offset ${range.fromOffset} is the same as ending offset, " + s"skipping ${range.messageQueue}") Iterator.empty } else { val underlying = new NextIterator[MessageExt]() { private var requestOffset = range.fromOffset override def getNext(): MessageExt = { if (requestOffset >= range.untilOffset) { // Processed all offsets in this partition. 
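// (NextIterator contract: once `finished` is set, the value returned by getNext() is ignored, so returning null here is safe.)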
finished = true null } else { val r = consumer.get(requestOffset, range.untilOffset, pollTimeoutMs, failOnDataLoss) if (r == null) { // Losing some data. Skip the rest offsets in this partition. finished = true null } else { requestOffset = r.getQueueOffset + 1 // The MessageExt structure does not contains any field of `brokerName`, so put one into properties r.putUserProperty(RocketMQSource.PROP_BROKER_NAME, sourcePartition.offsetRange.messageQueue.getBrokerName) r } } } override protected def close(): Unit = { if (!reuseRocketMQConsumer) { consumer.close() } else { CachedRocketMQConsumer.releaseConsumer(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } } } // Release consumer, either by removing it or indicating we're no longer using it context.addTaskCompletionListener[Unit] { _ => underlying.closeIfNeeded() } underlying } } /** * Resolve the EARLIEST/LATEST placeholder in range * @return the range with actual boundary */ private def resolveRange(consumer: CachedRocketMQConsumer, range: RocketMQSourceRDDOffsetRange) = { if (range.fromOffset < 0 || range.untilOffset < 0) { // Late bind the offset range val availableOffsetRange = consumer.getAvailableOffsetRange() val fromOffset = if (range.fromOffset < 0) { assert(range.fromOffset == RocketMQOffsetRangeLimit.EARLIEST, s"earliest offset ${range.fromOffset} does not equal ${RocketMQOffsetRangeLimit.EARLIEST}") availableOffsetRange.earliest } else { range.fromOffset } val untilOffset = if (range.untilOffset < 0) { assert(range.untilOffset == RocketMQOffsetRangeLimit.LATEST, s"latest offset ${range.untilOffset} does not equal ${RocketMQOffsetRangeLimit.LATEST}") availableOffsetRange.latest } else { range.untilOffset } RocketMQSourceRDDOffsetRange(range.messageQueue, fromOffset, untilOffset, range.preferredLoc) } else { range } } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.0/org/apache/spark/streaming/RocketMqRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.streaming import org.apache.rocketmq.common.message.MessageExt import org.apache.rocketmq.spark._ import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.storage.StorageLevel import org.apache.spark.{Partition, SparkContext, TaskContext} import java.{util => ju} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer /** * A batch-oriented interface for consuming from RocketMq. * Starting and ending offsets are specified in advance, * so that you can control exactly-once semantics. 
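* (This RDD mixes in [[HasOffsetRanges]], so the driver can read back the exact ranges that were consumed, e.g. to commit them to an external store once the batch succeeds.)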
* @param groupId it is for rocketMq for identifying the consumer * @param optionParams the configs * @param offsetRanges offset ranges that define the RocketMq data belonging to this RDD * @param preferredHosts map from TopicQueueId to preferred host for processing that partition. * In most cases, use [[LocationStrategy.PreferConsistent]] * @param useConsumerCache useConsumerCache whether to use a consumer from a per-jvm cache */ class RocketMqRDD ( sc: SparkContext, val groupId: String, val optionParams: ju.Map[String, String], val offsetRanges: ju.Map[TopicQueueId, Array[OffsetRange]], val preferredHosts: ju.Map[TopicQueueId, String], val useConsumerCache: Boolean )extends RDD[MessageExt](sc, Nil) with HasOffsetRanges{ private val cacheInitialCapacity = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_INIT_CAPACITY, "16").toInt private val cacheMaxCapacity = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_MAX_CAPACITY, "64").toInt private val cacheLoadFactor = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_LOAD_FACTOR, "0.75").toFloat override def persist(newLevel: StorageLevel): this.type = { super.persist(newLevel) } override def getPartitions: Array[Partition] = { offsetRanges.asScala.toArray.zipWithIndex.map{ case ((first, second), i) => new RocketMqRDDPartition(i, first.topic, first.queueId, second) }.toArray } override def count(): Long = offsetRanges.asScala.map(_._2.map(_.count).sum).sum override def countApprox( timeout: Long, confidence: Double = 0.95 ): PartialResult[BoundedDouble] = { val c = count new PartialResult(new BoundedDouble(c, 1.0, c, c), true) } override def isEmpty(): Boolean = count == 0L override def take(num: Int): Array[MessageExt] = { val nonEmptyPartitions = this.partitions .map(_.asInstanceOf[RocketMqRDDPartition]) .filter(_.count > 0) if (num < 1 || nonEmptyPartitions.isEmpty) { return new Array[MessageExt](0) } // Determine in advance how many messages need to be taken from each partition val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => val remain = num - result.values.sum if (remain > 0) { val taken = Math.min(remain, part.count) result + (part.index -> taken.toInt) } else { result } } val buf = new ArrayBuffer[MessageExt] val res = context.runJob( this, (tc: TaskContext, it: Iterator[MessageExt]) => it.take(parts(tc.partitionId)).toArray, parts.keys.toArray ) res.foreach(buf ++= _) buf.toArray } private def executors(): Array[ExecutorCacheTaskLocation] = { val bm = sparkContext.env.blockManager bm.master.getPeers(bm.blockManagerId).toArray .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) .sortWith(compareExecutors) } private def compareExecutors( a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = if (a.host == b.host) { a.executorId > b.executorId } else { a.host > b.host } /** * Non-negative modulus, from java 8 math */ private def floorMod(a: Int, b: Int): Int = ((a % b) + b) % b protected override def getPreferredLocations(thePart: Partition): Seq[String] = { // The intention is best-effort consistent executor for a given topic partition, // so that caching consumers can be effective. 
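// For example, with 3 sorted executors a queue whose hashCode is -7 is assigned index floorMod(-7, 3) = 2 on every batch, so the same cached consumer is reused.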
val part = thePart.asInstanceOf[RocketMqRDDPartition] val allExecs = executors() val tp = part.topicQueueId() val prefHost = preferredHosts.get(tp) val prefExecs = if (null == prefHost) allExecs else allExecs.filter(_.host == prefHost) val execs = if (prefExecs.isEmpty) allExecs else prefExecs if (execs.isEmpty) { Seq() } else { // execs is sorted, tp.hashCode depends only on topic and partition, so consistent index val index = this.floorMod(tp.hashCode, execs.length) val chosen = execs(index) Seq(chosen.toString) } } private def errBeginAfterEnd(part: RocketMqRDDPartition): String = s"Beginning offset is after the ending offset ${part.partitionOffsetRanges.mkString(",")} " + s"for topic ${part.topic} partition ${part.index}. " + "You either provided an invalid fromOffset, or the Kafka topic has been damaged" override def compute(thePart: Partition, context: TaskContext): Iterator[MessageExt] = { val part = thePart.asInstanceOf[RocketMqRDDPartition] val count = part.count() assert(count >= 0, errBeginAfterEnd(part)) if (count == 0) { logInfo(s"Beginning offset is the same as ending offset " + s"skipping ${part.topic} ${part.queueId}") Iterator.empty } else { new RocketMqRDDIterator(part, context) } } /** * An iterator that fetches messages directly from rocketmq for the offsets in partition. * Uses a cached consumer where possible to take advantage of prefetching */ private class RocketMqRDDIterator( part: RocketMqRDDPartition, context: TaskContext) extends Iterator[MessageExt] { logDebug(s"Computing topic ${part.topic}, queueId ${part.queueId} " + s"offsets ${part.partitionOffsetRanges.mkString(",")}") context.addTaskCompletionListener[Unit]{ context => closeIfNeeded() } val consumer = if (useConsumerCache) { CachedMQConsumer.init(cacheInitialCapacity, cacheMaxCapacity, cacheLoadFactor) if (context.attemptNumber > 5) { // just in case the prior attempt failures were cache related CachedMQConsumer.remove(groupId, part.topic, part.queueId, part.brokerNames) } CachedMQConsumer.getOrCreate(groupId, part.topic, part.queueId, part.brokerNames, optionParams) } else { CachedMQConsumer.getUncached(groupId, part.topic, part.queueId, part.brokerNames, optionParams) } var logicTotalOffset = 0 val totalSum = part.partitionOffsetRanges.map(_.count).sum var index = 0 var requestOffset = part.partitionOffsetRanges.apply(index).fromOffset def closeIfNeeded(): Unit = { if (!useConsumerCache && consumer != null) { consumer.client.shutdown } } override def hasNext(): Boolean = { totalSum > logicTotalOffset } override def next(): MessageExt = { assert(hasNext(), "Can't call getNext() once untilOffset has been reached") val queueRange = part.partitionOffsetRanges.apply(index) val r = consumer.get(queueRange.brokerName, requestOffset) if (queueRange.untilOffset > (requestOffset + 1)) requestOffset +=1 else { index +=1 if (part.partitionOffsetRanges.length > index) requestOffset = part.partitionOffsetRanges.apply(index).fromOffset } logicTotalOffset += 1 r } } private[RocketMqRDD] type OffsetRangeTuple = (String, Int) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.1/org/apache/spark/sql/rocketmq/RocketMQSource.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSource.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ * 2. Schema of output dataframe adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import java.io._ import java.nio.charset.StandardCharsets import java.{util => ju} import org.apache.commons.io.IOUtils import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.rocketmq.RocketMQSource._ import org.apache.spark.sql.types.{StructField, _} import org.apache.spark.unsafe.types.UTF8String /** * A [[Source]] that reads data from RocketMQ using the following design. * * - The [[RocketMQSourceOffset]] is the custom [[Offset]] defined for this source that contains * a map of MessageQueue -> offset. Note that this offset is 1 + (available offset). For * example if the last record in a RocketMQ topic "t", partition 2 is offset 5, then * RocketMQSourceOffset will contain MessageQueue("t", 2) -> 6. This is done keep it consistent * with the semantics of `MQPullConsumer.fetchConsumeOffset()`. * * - The [[RocketMQSource]] written to do the following. * * - As soon as the source is created, the pre-configured [[RocketMQOffsetReader]] * is used to query the initial offsets that this source should * start reading from. This is used to create the first batch. * * - `getOffset()` uses the [[RocketMQOffsetReader]] to query the latest * available offsets, which are returned as a [[RocketMQSourceOffset]]. * * - `getBatch()` returns a DF that reads from the 'start offset' until the 'end offset' in * for each partition. The end offset is excluded to be consistent with the semantics of * [[RocketMQSourceOffset]] and `MQPullConsumer.fetchConsumeOffset()`. * * - The DF returned is based on [[RocketMQSourceRDD]] which is constructed such that the * data from RocketMQ topic + partition is consistently read by the same executors across * batches, and cached RocketMQConsumers in the executors can be reused efficiently. See the * docs on [[RocketMQSourceRDD]] for more details. * * Zero data lost is not guaranteed when topics are deleted. If zero data lost is critical, the user * must make sure all messages in a topic have been processed when deleting a topic. 
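*
* A minimal read sketch (hedged: the short format name "rocketmq" and the SparkSession value
* `spark` are assumptions for illustration; only the "failOnDataLoss" and "maxOffsetsPerTrigger"
* option keys are taken from this source):
* {{{
*   val df = spark.readStream
*     .format("rocketmq")                      // hypothetical short name registered by the provider
*     .option("failOnDataLoss", "false")
*     .option("maxOffsetsPerTrigger", "10000")
*     .load()
*   df.selectExpr("topic", "queueId", "CAST(body AS STRING)")
* }}}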
*/ private class RocketMQSource( sqlContext: SQLContext, offsetReader: RocketMQOffsetReader, executorRocketMQParams: ju.Map[String, String], sourceOptions: Map[String, String], metadataPath: String, startingOffsets: RocketMQOffsetRangeLimit, failOnDataLoss: Boolean) extends Source with Logging { private val sc = sqlContext.sparkContext private val pollTimeoutMs = sourceOptions.getOrElse( RocketMQConf.PULL_TIMEOUT_MS, sc.conf.getTimeAsMs("spark.network.timeout", "120s").toString ).toLong private val maxOffsetsPerTrigger = sourceOptions.get("maxOffsetsPerTrigger").map(_.toLong) /** * Lazily initialize `initialPartitionOffsets` to make sure that `RocketMQConsumer.pull` is only * called in StreamExecutionThread. */ private lazy val initialPartitionOffsets = { val metadataLog = new HDFSMetadataLog[RocketMQSourceOffset](sqlContext.sparkSession, metadataPath) { override def serialize(metadata: RocketMQSourceOffset, out: OutputStream): Unit = { out.write(0) // A zero byte is written to support Spark 2.1.0 (SPARK-19517) val writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8)) writer.write("v" + VERSION + "\n") writer.write(metadata.json) writer.flush() } override def deserialize(in: InputStream): RocketMQSourceOffset = { in.read() // A zero byte is read to support Spark 2.1.0 (SPARK-19517) val content = IOUtils.toString(new InputStreamReader(in, StandardCharsets.UTF_8)) // HDFSMetadataLog guarantees that it never creates a partial file. assert(content.length != 0) if (content(0) == 'v') { val indexOfNewLine = content.indexOf("\n") if (indexOfNewLine > 0) { val version = validateVersion(content.substring(0, indexOfNewLine), VERSION) RocketMQSourceOffset(SerializedOffset(content.substring(indexOfNewLine + 1))) } else { throw new IllegalStateException( s"Log file was malformed: failed to detect the log file version line.") } } else { // The log was generated by Spark 2.1.0 RocketMQSourceOffset(SerializedOffset(content)) } } } metadataLog.get(0).getOrElse { val offsets = startingOffsets match { case EarliestOffsetRangeLimit => RocketMQSourceOffset(offsetReader.fetchEarliestOffsets()) case LatestOffsetRangeLimit => RocketMQSourceOffset(offsetReader.fetchLatestOffsets()) case SpecificOffsetRangeLimit(p) => offsetReader.fetchSpecificOffsets(p, reportDataLoss) } metadataLog.add(0, offsets) logInfo(s"Initial offsets: $offsets") offsets }.queueToOffsets } private var currentPartitionOffsets: Option[Map[MessageQueue, Long]] = None override def schema: StructType = RocketMQSource.schema /** Returns the maximum available offset for this source. 
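* When `maxOffsetsPerTrigger` is set, the returned per-queue offsets are capped via [[rateLimit]] so a single micro-batch does not consume the entire backlog.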
*/ override def getOffset: Option[Offset] = { // Make sure initialPartitionOffsets is initialized initialPartitionOffsets val latest = offsetReader.fetchLatestOffsets() val offsets = maxOffsetsPerTrigger match { case None => latest case Some(limit) if currentPartitionOffsets.isEmpty => rateLimit(limit, initialPartitionOffsets, latest) case Some(limit) => rateLimit(limit, currentPartitionOffsets.get, latest) } currentPartitionOffsets = Some(offsets) logDebug(s"GetOffset: ${offsets.toSeq.map(_.toString).sorted}") Some(RocketMQSourceOffset(offsets)) } /** Proportionally distribute limit number of offsets among message queues */ private def rateLimit( limit: Long, from: Map[MessageQueue, Long], until: Map[MessageQueue, Long]): Map[MessageQueue, Long] = { val fromNew = offsetReader.fetchEarliestOffsets(until.keySet.diff(from.keySet).toSeq) val sizes = until.flatMap { case (tp, end) => // If begin isn't defined, something's wrong, but let alert logic in getBatch handle it from.get(tp).orElse(fromNew.get(tp)).flatMap { begin => val size = end - begin logDebug(s"rateLimit $tp size is $size") if (size > 0) Some(tp -> size) else None } } val total = sizes.values.sum.toDouble if (total < 1) { until } else { until.map { case (tp, end) => tp -> sizes.get(tp).map { size => val begin = from.get(tp).getOrElse(fromNew(tp)) val prorate = limit * (size / total) logDebug(s"rateLimit $tp prorated amount is $prorate") // Don't completely starve small topicpartitions val off = begin + (if (prorate < 1) Math.ceil(prorate) else Math.floor(prorate)).toLong logDebug(s"rateLimit $tp new offset is $off") // Paranoia, make sure not to return an offset that's past end Math.min(end, off) }.getOrElse(end) } } } /** * Returns the data that is between the offsets * [`start.get.partitionToOffsets`, `end.partitionToOffsets`), i.e. end.partitionToOffsets is * exclusive. */ override def getBatch(start: Option[Offset], end: Offset): DataFrame = { // Make sure initialPartitionOffsets is initialized initialPartitionOffsets logInfo(s"GetBatch called with start = $start, end = $end") val untilPartitionOffsets = RocketMQSourceOffset.getPartitionOffsets(end) // On recovery, getBatch will get called before getOffset if (currentPartitionOffsets.isEmpty) { currentPartitionOffsets = Some(untilPartitionOffsets) } if (start.isDefined && start.get == end) { return sqlContext.internalCreateDataFrame( sqlContext.sparkContext.emptyRDD, schema, isStreaming = true) } val fromPartitionOffsets = start match { case Some(prevBatchEndOffset) => RocketMQSourceOffset.getPartitionOffsets(prevBatchEndOffset) case None => initialPartitionOffsets } // Find the new partitions, and get their earliest offsets val newPartitions = untilPartitionOffsets.keySet.diff(fromPartitionOffsets.keySet) val newPartitionOffsets = offsetReader.fetchEarliestOffsets(newPartitions.toSeq) if (newPartitionOffsets.keySet != newPartitions) { // We cannot get from offsets for some partitions. It means they got deleted. val deletedPartitions = newPartitions.diff(newPartitionOffsets.keySet) reportDataLoss( s"Cannot find earliest offsets of $deletedPartitions. Some data may have been missed") } logInfo(s"Partitions added: $newPartitionOffsets") newPartitionOffsets.filter(_._2 != 0).foreach { case (p, o) => reportDataLoss( s"Added partition $p starts from $o instead of 0. Some data may have been missed") } val deletedPartitions = fromPartitionOffsets.keySet.diff(untilPartitionOffsets.keySet) if (deletedPartitions.nonEmpty) { reportDataLoss(s"$deletedPartitions are gone. 
Some data may have been missed") } // Use the until partitions to calculate offset ranges to ignore partitions that have // been deleted val topicPartitions = untilPartitionOffsets.keySet.filter { tp => // Ignore partitions that we don't know the from offsets. newPartitionOffsets.contains(tp) || fromPartitionOffsets.contains(tp) }.toSeq logDebug("TopicPartitions: " + topicPartitions.mkString(", ")) val sortedExecutors = getSortedExecutorList(sc) val numExecutors = sortedExecutors.length logDebug("Sorted executors: " + sortedExecutors.mkString(", ")) // Calculate offset ranges val offsetRanges = topicPartitions.map { tp => val fromOffset = fromPartitionOffsets.getOrElse(tp, { newPartitionOffsets.getOrElse(tp, { // This should not happen since newPartitionOffsets contains all partitions not in // fromPartitionOffsets throw new IllegalStateException(s"$tp doesn't have a from offset") }) }) val untilOffset = untilPartitionOffsets(tp) val preferredLoc = if (numExecutors > 0) { // This allows cached RocketMQConsumers in the executors to be re-used to read the same // partition in every batch. Some(sortedExecutors(Math.floorMod(tp.hashCode, numExecutors))) } else None RocketMQSourceRDDOffsetRange(tp, fromOffset, untilOffset, preferredLoc) }.filter { range => if (range.untilOffset < range.fromOffset) { reportDataLoss(s"Partition ${range.messageQueue}'s offset was changed from " + s"${range.fromOffset} to ${range.untilOffset}, some data may have been missed") false } else { true } }.toArray // Create an RDD that reads from RocketMQ and get the (key, value) pair as byte arrays. val rdd = new RocketMQSourceRDD( sc, executorRocketMQParams, offsetRanges, pollTimeoutMs, failOnDataLoss, reuseRocketMQConsumer = true).map { cr => // Remove the `brokerName` property which was added by us. See `RocketMQSourceRDD.compute` val brokerName = cr.getProperties.remove(RocketMQSource.PROP_BROKER_NAME) InternalRow( UTF8String.fromString(cr.getTopic), // topic cr.getFlag, // flag cr.getBody, // body UTF8String.fromString(JsonUtils.messageProperties(cr.getProperties)), // properties UTF8String.fromString(brokerName), // brokerName cr.getQueueId, // queueId cr.getQueueOffset, // queueOffset DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.getBornTimestamp)), // bornTimestamp DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.getStoreTimestamp)) // storeTimestamp ) } logInfo("GetBatch generating RDD of offset range: " + offsetRanges.sortBy(_.messageQueue.toString).mkString(", ")) sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) } /** Stop this source and free any resources it has allocated. */ override def stop(): Unit = synchronized { offsetReader.close() } override def toString: String = s"RocketMQSource[$offsetReader]" /** * If `failOnDataLoss` is true, this method will throw an `IllegalStateException`. * Otherwise, just log a warning. */ private def reportDataLoss(message: String): Unit = { if (failOnDataLoss) { throw new IllegalStateException(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE") } else { logWarning(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE") } } } /** Companion object for the [[RocketMQSource]]. */ private object RocketMQSource { val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE = """ |Some data may have been lost because they are not available in RocketMQ any more; either the | data was aged out by RocketMQ or the topic may have been deleted before all the data in the | topic was processed. 
If you want your streaming query to fail on such cases, set the source | option "failOnDataLoss" to "true". """.stripMargin val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE = """ |Some data may have been lost because they are not available in RocketMQ any more; either the | data was aged out by RocketMQ or the topic may have been deleted before all the data in the | topic was processed. If you don't want your streaming query to fail on such cases, set the | source option "failOnDataLoss" to "false". """.stripMargin val VERSION = 1 val PROP_BROKER_NAME = "_brokerName" def getSortedExecutorList(sc: SparkContext): Array[String] = { val bm = sc.env.blockManager bm.master.getPeers(bm.blockManagerId).toArray .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) .sortWith(compare) .map(_.toString) } private def compare(a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = { if (a.host == b.host) { a.executorId > b.executorId } else { a.host > b.host } } def schema: StructType = StructType(Seq( // fields of `Message` StructField("topic", StringType), StructField("flag", IntegerType), StructField("body", BinaryType), StructField("properties", StringType), // fields of `MessageExt` StructField("brokerName", StringType), StructField("queueId", IntegerType), StructField("queueOffset", LongType), StructField("bornTimestamp", TimestampType), StructField("storeTimestamp", TimestampType) )) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.1/org/apache/spark/sql/rocketmq/RocketMQSourceOffset.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSourceOffset.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.sql.connector.read.streaming.PartitionOffset import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} /** * An [[Offset]] for the [[RocketMQSource]]. This one tracks all partitions of subscribed topics and * their offsets. 
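*
* For example (illustrative values only), an offset covering two queues of topic "t" on broker "broker-a" can be built from (topic, brokerName, queueId, offset) tuples via the companion apply:
* {{{
*   RocketMQSourceOffset(("t", "broker-a", 0, 100L), ("t", "broker-a", 1, 200L))
* }}}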
*/ private[rocketmq] case class RocketMQSourceOffset(queueToOffsets: Map[MessageQueue, Long]) extends Offset { override val json = JsonUtils.partitionOffsets(queueToOffsets) } private[rocketmq] case class RocketMQSourcePartitionOffset(messageQueue: MessageQueue, queueOffset: Long) extends PartitionOffset /** Companion object of the [[RocketMQSourceOffset]] */ private[rocketmq] object RocketMQSourceOffset { def getPartitionOffsets(offset: Offset): Map[MessageQueue, Long] = { offset match { case o: RocketMQSourceOffset => o.queueToOffsets case so: SerializedOffset => RocketMQSourceOffset(so).queueToOffsets case _ => throw new IllegalArgumentException( s"Invalid conversion from offset of ${offset.getClass} to RocketMQSourceOffset") } } /** * Returns [[RocketMQSourceOffset]] from a variable sequence of (topic, brokerName, queueId, offset) * tuples. */ def apply(offsetTuples: (String, String, Int, Long)*): RocketMQSourceOffset = { RocketMQSourceOffset(offsetTuples.map { case(t, b, q, o) => (new MessageQueue(t, b, q), o) }.toMap) } /** * Returns [[RocketMQSourceOffset]] from a JSON [[SerializedOffset]] */ def apply(offset: SerializedOffset): RocketMQSourceOffset = RocketMQSourceOffset(JsonUtils.partitionOffsets(offset.json)) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.1/org/apache/spark/sql/rocketmq/RocketMQSourceRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSourceRDD.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import org.apache.rocketmq.common.message.{MessageExt, MessageQueue} import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.util.NextIterator import org.apache.spark.{Partition, SparkContext, TaskContext} import java.{util => ju} import scala.collection.mutable.ArrayBuffer /** Offset range that one partition of the RocketMQSourceRDD has to read */ private[rocketmq] case class RocketMQSourceRDDOffsetRange( messageQueue: MessageQueue, fromOffset: Long, untilOffset: Long, preferredLoc: Option[String]) { def size: Long = untilOffset - fromOffset } /** Partition of the RocketMQSourceRDD */ private[rocketmq] case class RocketMQSourceRDDPartition(index: Int, offsetRange: RocketMQSourceRDDOffsetRange) extends Partition /** * An RDD that reads data from RocketMQ based on offset ranges across multiple partitions. 
* Additionally, it allows preferred locations to be set for each topic + partition, so that * the [[RocketMQSource]] can ensure the same executor always reads the same topic + partition * and cached RocketMQConsumers (see [[CachedRocketMQConsumer]] can be used read data efficiently. * * @param sc the [[SparkContext]] * @param executorRocketMQParams RocketMQ configuration for creating RocketMQConsumer on the executors * @param offsetRanges Offset ranges that define the RocketMQ data belonging to this RDD */ private[rocketmq] class RocketMQSourceRDD( sc: SparkContext, executorRocketMQParams: ju.Map[String, String], offsetRanges: Seq[RocketMQSourceRDDOffsetRange], pollTimeoutMs: Long, failOnDataLoss: Boolean, reuseRocketMQConsumer: Boolean) extends RDD[MessageExt](sc, Nil) { override def persist(newLevel: StorageLevel): this.type = { logError("RocketMQ ConsumerRecord is not serializable. " + "Use .map to extract fields before calling .persist or .window") super.persist(newLevel) } override def getPartitions: Array[Partition] = { offsetRanges.zipWithIndex.map { case (o, i) => RocketMQSourceRDDPartition(i, o) }.toArray } override def count(): Long = offsetRanges.map(_.size).sum override def countApprox(timeout: Long, confidence: Double): PartialResult[BoundedDouble] = { val c = count() new PartialResult(new BoundedDouble(c, 1.0, c, c), true) } override def isEmpty(): Boolean = count == 0L override def take(num: Int): Array[MessageExt] = { val nonEmptyPartitions = this.partitions .map(_.asInstanceOf[RocketMQSourceRDDPartition]) .filter(_.offsetRange.size > 0) if (num < 1 || nonEmptyPartitions.isEmpty) { return new Array[MessageExt](0) } // Determine in advance how many messages need to be taken from each partition val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => val remain = num - result.values.sum if (remain > 0) { val taken = Math.min(remain, part.offsetRange.size) result + (part.index -> taken.toInt) } else { result } } val buf = new ArrayBuffer[MessageExt] val res = context.runJob( this, (tc: TaskContext, it: Iterator[MessageExt]) => it.take(parts(tc.partitionId())).toArray, parts.keys.toArray ) res.foreach(buf ++= _) buf.toArray } override def getPreferredLocations(split: Partition): Seq[String] = { val part = split.asInstanceOf[RocketMQSourceRDDPartition] part.offsetRange.preferredLoc.map(Seq(_)).getOrElse(Seq.empty) } override def compute( thePart: Partition, context: TaskContext): Iterator[MessageExt] = { val sourcePartition = thePart.asInstanceOf[RocketMQSourceRDDPartition] val consumer = if (!reuseRocketMQConsumer) { CachedRocketMQConsumer.getOrCreate(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } else { CachedRocketMQConsumer.createUncached(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } val range = resolveRange(consumer, sourcePartition.offsetRange) assert( range.fromOffset <= range.untilOffset, s"Beginning offset ${range.fromOffset} is after the ending offset ${range.untilOffset} for " + s"${range.messageQueue}. You either provided an invalid fromOffset, or the RocketMQ topic has been damaged") if (range.fromOffset == range.untilOffset) { logInfo(s"Beginning offset ${range.fromOffset} is the same as ending offset, " + s"skipping ${range.messageQueue}") Iterator.empty } else { val underlying = new NextIterator[MessageExt]() { private var requestOffset = range.fromOffset override def getNext(): MessageExt = { if (requestOffset >= range.untilOffset) { // Processed all offsets in this partition. 
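// Setting `finished` tells NextIterator that iteration is over; the null returned below is
// discarded by the framework once `finished` is true.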
finished = true null } else { val r = consumer.get(requestOffset, range.untilOffset, pollTimeoutMs, failOnDataLoss) if (r == null) { // Losing some data. Skip the rest offsets in this partition. finished = true null } else { requestOffset = r.getQueueOffset + 1 // The MessageExt structure does not contains any field of `brokerName`, so put one into properties r.putUserProperty(RocketMQSource.PROP_BROKER_NAME, sourcePartition.offsetRange.messageQueue.getBrokerName) r } } } override protected def close(): Unit = { if (!reuseRocketMQConsumer) { consumer.close() } else { CachedRocketMQConsumer.releaseConsumer(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } } } // Release consumer, either by removing it or indicating we're no longer using it context.addTaskCompletionListener[Unit] { _ => underlying.closeIfNeeded() } underlying } } /** * Resolve the EARLIEST/LATEST placeholder in range * @return the range with actual boundary */ private def resolveRange(consumer: CachedRocketMQConsumer, range: RocketMQSourceRDDOffsetRange) = { if (range.fromOffset < 0 || range.untilOffset < 0) { // Late bind the offset range val availableOffsetRange = consumer.getAvailableOffsetRange() val fromOffset = if (range.fromOffset < 0) { assert(range.fromOffset == RocketMQOffsetRangeLimit.EARLIEST, s"earliest offset ${range.fromOffset} does not equal ${RocketMQOffsetRangeLimit.EARLIEST}") availableOffsetRange.earliest } else { range.fromOffset } val untilOffset = if (range.untilOffset < 0) { assert(range.untilOffset == RocketMQOffsetRangeLimit.LATEST, s"latest offset ${range.untilOffset} does not equal ${RocketMQOffsetRangeLimit.LATEST}") availableOffsetRange.latest } else { range.untilOffset } RocketMQSourceRDDOffsetRange(range.messageQueue, fromOffset, untilOffset, range.preferredLoc) } else { range } } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.1/org/apache/spark/streaming/RocketMqRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.streaming import org.apache.rocketmq.common.message.MessageExt import org.apache.rocketmq.spark._ import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.storage.StorageLevel import org.apache.spark.{Partition, SparkContext, TaskContext} import java.{util => ju} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer /** * A batch-oriented interface for consuming from RocketMq. * Starting and ending offsets are specified in advance, * so that you can control exactly-once semantics. 
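 * Records are fetched lazily inside `compute`, one queue range at a time, using a pull consumer
 * that may be cached per JVM (see `useConsumerCache`).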
* @param groupId it is for rocketMq for identifying the consumer * @param optionParams the configs * @param offsetRanges offset ranges that define the RocketMq data belonging to this RDD * @param preferredHosts map from TopicQueueId to preferred host for processing that partition. * In most cases, use [[LocationStrategy.PreferConsistent]] * @param useConsumerCache useConsumerCache whether to use a consumer from a per-jvm cache */ class RocketMqRDD ( sc: SparkContext, val groupId: String, val optionParams: ju.Map[String, String], val offsetRanges: ju.Map[TopicQueueId, Array[OffsetRange]], val preferredHosts: ju.Map[TopicQueueId, String], val useConsumerCache: Boolean )extends RDD[MessageExt](sc, Nil) with HasOffsetRanges{ private val cacheInitialCapacity = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_INIT_CAPACITY, "16").toInt private val cacheMaxCapacity = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_MAX_CAPACITY, "64").toInt private val cacheLoadFactor = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_LOAD_FACTOR, "0.75").toFloat override def persist(newLevel: StorageLevel): this.type = { super.persist(newLevel) } override def getPartitions: Array[Partition] = { offsetRanges.asScala.toArray.zipWithIndex.map{ case ((first, second), i) => new RocketMqRDDPartition(i, first.topic, first.queueId, second) }.toArray } override def count(): Long = offsetRanges.asScala.map(_._2.map(_.count).sum).sum override def countApprox( timeout: Long, confidence: Double = 0.95 ): PartialResult[BoundedDouble] = { val c = count new PartialResult(new BoundedDouble(c, 1.0, c, c), true) } override def isEmpty(): Boolean = count == 0L override def take(num: Int): Array[MessageExt] = { val nonEmptyPartitions = this.partitions .map(_.asInstanceOf[RocketMqRDDPartition]) .filter(_.count > 0) if (num < 1 || nonEmptyPartitions.isEmpty) { return new Array[MessageExt](0) } // Determine in advance how many messages need to be taken from each partition val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => val remain = num - result.values.sum if (remain > 0) { val taken = Math.min(remain, part.count) result + (part.index -> taken.toInt) } else { result } } val buf = new ArrayBuffer[MessageExt] val res = context.runJob( this, (tc: TaskContext, it: Iterator[MessageExt]) => it.take(parts(tc.partitionId)).toArray, parts.keys.toArray ) res.foreach(buf ++= _) buf.toArray } private def executors(): Array[ExecutorCacheTaskLocation] = { val bm = sparkContext.env.blockManager bm.master.getPeers(bm.blockManagerId).toArray .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) .sortWith(compareExecutors) } private def compareExecutors( a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = if (a.host == b.host) { a.executorId > b.executorId } else { a.host > b.host } /** * Non-negative modulus, from java 8 math */ private def floorMod(a: Int, b: Int): Int = ((a % b) + b) % b protected override def getPreferredLocations(thePart: Partition): Seq[String] = { // The intention is best-effort consistent executor for a given topic partition, // so that caching consumers can be effective. 
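// Strategy: restrict to executors on the preferred host when one is configured, then pick a
// stable executor via floorMod(tp.hashCode, execs.length) so the same queue keeps being read
// by the same executor across batches.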
val part = thePart.asInstanceOf[RocketMqRDDPartition] val allExecs = executors() val tp = part.topicQueueId() val prefHost = preferredHosts.get(tp) val prefExecs = if (null == prefHost) allExecs else allExecs.filter(_.host == prefHost) val execs = if (prefExecs.isEmpty) allExecs else prefExecs if (execs.isEmpty) { Seq() } else { // execs is sorted, tp.hashCode depends only on topic and partition, so consistent index val index = this.floorMod(tp.hashCode, execs.length) val chosen = execs(index) Seq(chosen.toString) } } private def errBeginAfterEnd(part: RocketMqRDDPartition): String = s"Beginning offset is after the ending offset ${part.partitionOffsetRanges.mkString(",")} " + s"for topic ${part.topic} partition ${part.index}. " + "You either provided an invalid fromOffset, or the Kafka topic has been damaged" override def compute(thePart: Partition, context: TaskContext): Iterator[MessageExt] = { val part = thePart.asInstanceOf[RocketMqRDDPartition] val count = part.count() assert(count >= 0, errBeginAfterEnd(part)) if (count == 0) { logInfo(s"Beginning offset is the same as ending offset " + s"skipping ${part.topic} ${part.queueId}") Iterator.empty } else { new RocketMqRDDIterator(part, context) } } /** * An iterator that fetches messages directly from rocketmq for the offsets in partition. * Uses a cached consumer where possible to take advantage of prefetching */ private class RocketMqRDDIterator( part: RocketMqRDDPartition, context: TaskContext) extends Iterator[MessageExt] { logDebug(s"Computing topic ${part.topic}, queueId ${part.queueId} " + s"offsets ${part.partitionOffsetRanges.mkString(",")}") context.addTaskCompletionListener[Unit]{ context => closeIfNeeded() } val consumer = if (useConsumerCache) { CachedMQConsumer.init(cacheInitialCapacity, cacheMaxCapacity, cacheLoadFactor) if (context.attemptNumber > 5) { // just in case the prior attempt failures were cache related CachedMQConsumer.remove(groupId, part.topic, part.queueId, part.brokerNames) } CachedMQConsumer.getOrCreate(groupId, part.topic, part.queueId, part.brokerNames, optionParams) } else { CachedMQConsumer.getUncached(groupId, part.topic, part.queueId, part.brokerNames, optionParams) } var logicTotalOffset = 0 val totalSum = part.partitionOffsetRanges.map(_.count).sum var index = 0 var requestOffset = part.partitionOffsetRanges.apply(index).fromOffset def closeIfNeeded(): Unit = { if (!useConsumerCache && consumer != null) { consumer.client.shutdown } } override def hasNext(): Boolean = { totalSum > logicTotalOffset } override def next(): MessageExt = { assert(hasNext(), "Can't call getNext() once untilOffset has been reached") val queueRange = part.partitionOffsetRanges.apply(index) val r = consumer.get(queueRange.brokerName, requestOffset) if (queueRange.untilOffset > (requestOffset + 1)) requestOffset +=1 else { index +=1 if (part.partitionOffsetRanges.length > index) requestOffset = part.partitionOffsetRanges.apply(index).fromOffset } logicTotalOffset += 1 r } } private[RocketMqRDD] type OffsetRangeTuple = (String, Int) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.2/org/apache/spark/sql/rocketmq/RocketMQSource.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSource.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ * 2. Schema of output dataframe adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import java.io._ import java.nio.charset.StandardCharsets import java.{util => ju} import org.apache.commons.io.IOUtils import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.rocketmq.RocketMQSource._ import org.apache.spark.sql.types.{StructField, _} import org.apache.spark.unsafe.types.UTF8String /** * A [[Source]] that reads data from RocketMQ using the following design. * * - The [[RocketMQSourceOffset]] is the custom [[Offset]] defined for this source that contains * a map of MessageQueue -> offset. Note that this offset is 1 + (available offset). For * example if the last record in a RocketMQ topic "t", partition 2 is offset 5, then * RocketMQSourceOffset will contain MessageQueue("t", 2) -> 6. This is done keep it consistent * with the semantics of `MQPullConsumer.fetchConsumeOffset()`. * * - The [[RocketMQSource]] written to do the following. * * - As soon as the source is created, the pre-configured [[RocketMQOffsetReader]] * is used to query the initial offsets that this source should * start reading from. This is used to create the first batch. * * - `getOffset()` uses the [[RocketMQOffsetReader]] to query the latest * available offsets, which are returned as a [[RocketMQSourceOffset]]. * * - `getBatch()` returns a DF that reads from the 'start offset' until the 'end offset' in * for each partition. The end offset is excluded to be consistent with the semantics of * [[RocketMQSourceOffset]] and `MQPullConsumer.fetchConsumeOffset()`. * * - The DF returned is based on [[RocketMQSourceRDD]] which is constructed such that the * data from RocketMQ topic + partition is consistently read by the same executors across * batches, and cached RocketMQConsumers in the executors can be reused efficiently. See the * docs on [[RocketMQSourceRDD]] for more details. * * Zero data lost is not guaranteed when topics are deleted. If zero data lost is critical, the user * must make sure all messages in a topic have been processed when deleting a topic. 
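 * As a concrete reading of the offset semantics above (illustrative numbers): if the stored
 * offset for a queue is 6 and the latest fetched offset is 10, the next batch reads queue
 * offsets 6, 7, 8 and 9, since the until offset is exclusive.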
*/ private class RocketMQSource( sqlContext: SQLContext, offsetReader: RocketMQOffsetReader, executorRocketMQParams: ju.Map[String, String], sourceOptions: Map[String, String], metadataPath: String, startingOffsets: RocketMQOffsetRangeLimit, failOnDataLoss: Boolean) extends Source with Logging { private val sc = sqlContext.sparkContext private val pollTimeoutMs = sourceOptions.getOrElse( RocketMQConf.PULL_TIMEOUT_MS, sc.conf.getTimeAsMs("spark.network.timeout", "120s").toString ).toLong private val maxOffsetsPerTrigger = sourceOptions.get("maxOffsetsPerTrigger").map(_.toLong) /** * Lazily initialize `initialPartitionOffsets` to make sure that `RocketMQConsumer.pull` is only * called in StreamExecutionThread. */ private lazy val initialPartitionOffsets = { val metadataLog = new HDFSMetadataLog[RocketMQSourceOffset](sqlContext.sparkSession, metadataPath) { override def serialize(metadata: RocketMQSourceOffset, out: OutputStream): Unit = { out.write(0) // A zero byte is written to support Spark 2.1.0 (SPARK-19517) val writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8)) writer.write("v" + VERSION + "\n") writer.write(metadata.json) writer.flush() } override def deserialize(in: InputStream): RocketMQSourceOffset = { in.read() // A zero byte is read to support Spark 2.1.0 (SPARK-19517) val content = IOUtils.toString(new InputStreamReader(in, StandardCharsets.UTF_8)) // HDFSMetadataLog guarantees that it never creates a partial file. assert(content.length != 0) if (content(0) == 'v') { val indexOfNewLine = content.indexOf("\n") if (indexOfNewLine > 0) { val version = validateVersion(content.substring(0, indexOfNewLine), VERSION) RocketMQSourceOffset(SerializedOffset(content.substring(indexOfNewLine + 1))) } else { throw new IllegalStateException( s"Log file was malformed: failed to detect the log file version line.") } } else { // The log was generated by Spark 2.1.0 RocketMQSourceOffset(SerializedOffset(content)) } } } metadataLog.get(0).getOrElse { val offsets = startingOffsets match { case EarliestOffsetRangeLimit => RocketMQSourceOffset(offsetReader.fetchEarliestOffsets()) case LatestOffsetRangeLimit => RocketMQSourceOffset(offsetReader.fetchLatestOffsets()) case SpecificOffsetRangeLimit(p) => offsetReader.fetchSpecificOffsets(p, reportDataLoss) } metadataLog.add(0, offsets) logInfo(s"Initial offsets: $offsets") offsets }.queueToOffsets } private var currentPartitionOffsets: Option[Map[MessageQueue, Long]] = None override def schema: StructType = RocketMQSource.schema /** Returns the maximum available offset for this source. 
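 * When the `maxOffsetsPerTrigger` option is set, the result is further capped per queue by
 * `rateLimit`, which splits the limit across queues in proportion to their backlog.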
*/ override def getOffset: Option[Offset] = { // Make sure initialPartitionOffsets is initialized initialPartitionOffsets val latest = offsetReader.fetchLatestOffsets() val offsets = maxOffsetsPerTrigger match { case None => latest case Some(limit) if currentPartitionOffsets.isEmpty => rateLimit(limit, initialPartitionOffsets, latest) case Some(limit) => rateLimit(limit, currentPartitionOffsets.get, latest) } currentPartitionOffsets = Some(offsets) logDebug(s"GetOffset: ${offsets.toSeq.map(_.toString).sorted}") Some(RocketMQSourceOffset(offsets)) } /** Proportionally distribute limit number of offsets among message queues */ private def rateLimit( limit: Long, from: Map[MessageQueue, Long], until: Map[MessageQueue, Long]): Map[MessageQueue, Long] = { val fromNew = offsetReader.fetchEarliestOffsets(until.keySet.diff(from.keySet).toSeq) val sizes = until.flatMap { case (tp, end) => // If begin isn't defined, something's wrong, but let alert logic in getBatch handle it from.get(tp).orElse(fromNew.get(tp)).flatMap { begin => val size = end - begin logDebug(s"rateLimit $tp size is $size") if (size > 0) Some(tp -> size) else None } } val total = sizes.values.sum.toDouble if (total < 1) { until } else { until.map { case (tp, end) => tp -> sizes.get(tp).map { size => val begin = from.get(tp).getOrElse(fromNew(tp)) val prorate = limit * (size / total) logDebug(s"rateLimit $tp prorated amount is $prorate") // Don't completely starve small topicpartitions val off = begin + (if (prorate < 1) Math.ceil(prorate) else Math.floor(prorate)).toLong logDebug(s"rateLimit $tp new offset is $off") // Paranoia, make sure not to return an offset that's past end Math.min(end, off) }.getOrElse(end) } } } /** * Returns the data that is between the offsets * [`start.get.partitionToOffsets`, `end.partitionToOffsets`), i.e. end.partitionToOffsets is * exclusive. */ override def getBatch(start: Option[Offset], end: Offset): DataFrame = { // Make sure initialPartitionOffsets is initialized initialPartitionOffsets logInfo(s"GetBatch called with start = $start, end = $end") val untilPartitionOffsets = RocketMQSourceOffset.getPartitionOffsets(end) // On recovery, getBatch will get called before getOffset if (currentPartitionOffsets.isEmpty) { currentPartitionOffsets = Some(untilPartitionOffsets) } if (start.isDefined && start.get == end) { return sqlContext.internalCreateDataFrame( sqlContext.sparkContext.emptyRDD, schema, isStreaming = true) } val fromPartitionOffsets = start match { case Some(prevBatchEndOffset) => RocketMQSourceOffset.getPartitionOffsets(prevBatchEndOffset) case None => initialPartitionOffsets } // Find the new partitions, and get their earliest offsets val newPartitions = untilPartitionOffsets.keySet.diff(fromPartitionOffsets.keySet) val newPartitionOffsets = offsetReader.fetchEarliestOffsets(newPartitions.toSeq) if (newPartitionOffsets.keySet != newPartitions) { // We cannot get from offsets for some partitions. It means they got deleted. val deletedPartitions = newPartitions.diff(newPartitionOffsets.keySet) reportDataLoss( s"Cannot find earliest offsets of $deletedPartitions. Some data may have been missed") } logInfo(s"Partitions added: $newPartitionOffsets") newPartitionOffsets.filter(_._2 != 0).foreach { case (p, o) => reportDataLoss( s"Added partition $p starts from $o instead of 0. Some data may have been missed") } val deletedPartitions = fromPartitionOffsets.keySet.diff(untilPartitionOffsets.keySet) if (deletedPartitions.nonEmpty) { reportDataLoss(s"$deletedPartitions are gone. 
Some data may have been missed") } // Use the until partitions to calculate offset ranges to ignore partitions that have // been deleted val topicPartitions = untilPartitionOffsets.keySet.filter { tp => // Ignore partitions that we don't know the from offsets. newPartitionOffsets.contains(tp) || fromPartitionOffsets.contains(tp) }.toSeq logDebug("TopicPartitions: " + topicPartitions.mkString(", ")) val sortedExecutors = getSortedExecutorList(sc) val numExecutors = sortedExecutors.length logDebug("Sorted executors: " + sortedExecutors.mkString(", ")) // Calculate offset ranges val offsetRanges = topicPartitions.map { tp => val fromOffset = fromPartitionOffsets.getOrElse(tp, { newPartitionOffsets.getOrElse(tp, { // This should not happen since newPartitionOffsets contains all partitions not in // fromPartitionOffsets throw new IllegalStateException(s"$tp doesn't have a from offset") }) }) val untilOffset = untilPartitionOffsets(tp) val preferredLoc = if (numExecutors > 0) { // This allows cached RocketMQConsumers in the executors to be re-used to read the same // partition in every batch. Some(sortedExecutors(Math.floorMod(tp.hashCode, numExecutors))) } else None RocketMQSourceRDDOffsetRange(tp, fromOffset, untilOffset, preferredLoc) }.filter { range => if (range.untilOffset < range.fromOffset) { reportDataLoss(s"Partition ${range.messageQueue}'s offset was changed from " + s"${range.fromOffset} to ${range.untilOffset}, some data may have been missed") false } else { true } }.toArray // Create an RDD that reads from RocketMQ and get the (key, value) pair as byte arrays. val rdd = new RocketMQSourceRDD( sc, executorRocketMQParams, offsetRanges, pollTimeoutMs, failOnDataLoss, reuseRocketMQConsumer = true).map { cr => // Remove the `brokerName` property which was added by us. See `RocketMQSourceRDD.compute` val brokerName = cr.getProperties.remove(RocketMQSource.PROP_BROKER_NAME) InternalRow( UTF8String.fromString(cr.getTopic), // topic cr.getFlag, // flag cr.getBody, // body UTF8String.fromString(JsonUtils.messageProperties(cr.getProperties)), // properties UTF8String.fromString(brokerName), // brokerName cr.getQueueId, // queueId cr.getQueueOffset, // queueOffset DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.getBornTimestamp)), // bornTimestamp DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.getStoreTimestamp)) // storeTimestamp ) } logInfo("GetBatch generating RDD of offset range: " + offsetRanges.sortBy(_.messageQueue.toString).mkString(", ")) sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) } /** Stop this source and free any resources it has allocated. */ override def stop(): Unit = synchronized { offsetReader.close() } override def toString: String = s"RocketMQSource[$offsetReader]" /** * If `failOnDataLoss` is true, this method will throw an `IllegalStateException`. * Otherwise, just log a warning. */ private def reportDataLoss(message: String): Unit = { if (failOnDataLoss) { throw new IllegalStateException(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE") } else { logWarning(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE") } } } /** Companion object for the [[RocketMQSource]]. */ private object RocketMQSource { val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE = """ |Some data may have been lost because they are not available in RocketMQ any more; either the | data was aged out by RocketMQ or the topic may have been deleted before all the data in the | topic was processed. 
If you want your streaming query to fail on such cases, set the source | option "failOnDataLoss" to "true". """.stripMargin val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE = """ |Some data may have been lost because they are not available in RocketMQ any more; either the | data was aged out by RocketMQ or the topic may have been deleted before all the data in the | topic was processed. If you don't want your streaming query to fail on such cases, set the | source option "failOnDataLoss" to "false". """.stripMargin val VERSION = 1 val PROP_BROKER_NAME = "_brokerName" def getSortedExecutorList(sc: SparkContext): Array[String] = { val bm = sc.env.blockManager bm.master.getPeers(bm.blockManagerId).toArray .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) .sortWith(compare) .map(_.toString) } private def compare(a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = { if (a.host == b.host) { a.executorId > b.executorId } else { a.host > b.host } } def schema: StructType = StructType(Seq( // fields of `Message` StructField("topic", StringType), StructField("flag", IntegerType), StructField("body", BinaryType), StructField("properties", StringType), // fields of `MessageExt` StructField("brokerName", StringType), StructField("queueId", IntegerType), StructField("queueOffset", LongType), StructField("bornTimestamp", TimestampType), StructField("storeTimestamp", TimestampType) )) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.2/org/apache/spark/sql/rocketmq/RocketMQSourceOffset.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSourceOffset.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.sql.connector.read.streaming.PartitionOffset import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} /** * An [[Offset]] for the [[RocketMQSource]]. This one tracks all partitions of subscribed topics and * their offsets. 
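 * Offsets round-trip through JSON: `json` is produced by JsonUtils.partitionOffsets, and
 * `getPartitionOffsets` rebuilds the map from a [[SerializedOffset]] read back from the
 * checkpoint log on recovery.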
*/ private[rocketmq] case class RocketMQSourceOffset(queueToOffsets: Map[MessageQueue, Long]) extends Offset { override val json = JsonUtils.partitionOffsets(queueToOffsets) } private[rocketmq] case class RocketMQSourcePartitionOffset(messageQueue: MessageQueue, queueOffset: Long) extends PartitionOffset /** Companion object of the [[RocketMQSourceOffset]] */ private[rocketmq] object RocketMQSourceOffset { def getPartitionOffsets(offset: Offset): Map[MessageQueue, Long] = { offset match { case o: RocketMQSourceOffset => o.queueToOffsets case so: SerializedOffset => RocketMQSourceOffset(so).queueToOffsets case _ => throw new IllegalArgumentException( s"Invalid conversion from offset of ${offset.getClass} to RocketMQSourceOffset") } } /** * Returns [[RocketMQSourceOffset]] from a variable sequence of (topic, brokerName, queueId, offset) * tuples. */ def apply(offsetTuples: (String, String, Int, Long)*): RocketMQSourceOffset = { RocketMQSourceOffset(offsetTuples.map { case(t, b, q, o) => (new MessageQueue(t, b, q), o) }.toMap) } /** * Returns [[RocketMQSourceOffset]] from a JSON [[SerializedOffset]] */ def apply(offset: SerializedOffset): RocketMQSourceOffset = RocketMQSourceOffset(JsonUtils.partitionOffsets(offset.json)) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.2/org/apache/spark/sql/rocketmq/RocketMQSourceRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSourceRDD.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import org.apache.rocketmq.common.message.{MessageExt, MessageQueue} import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.util.NextIterator import org.apache.spark.{Partition, SparkContext, TaskContext} import java.{util => ju} import scala.collection.mutable.ArrayBuffer /** Offset range that one partition of the RocketMQSourceRDD has to read */ private[rocketmq] case class RocketMQSourceRDDOffsetRange( messageQueue: MessageQueue, fromOffset: Long, untilOffset: Long, preferredLoc: Option[String]) { def size: Long = untilOffset - fromOffset } /** Partition of the RocketMQSourceRDD */ private[rocketmq] case class RocketMQSourceRDDPartition(index: Int, offsetRange: RocketMQSourceRDDOffsetRange) extends Partition /** * An RDD that reads data from RocketMQ based on offset ranges across multiple partitions. 
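 * Caching this RDD is discouraged: `persist` logs an error advising callers to extract the
 * needed fields with `.map` before calling `.persist` or `.window`.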
* Additionally, it allows preferred locations to be set for each topic + partition, so that * the [[RocketMQSource]] can ensure the same executor always reads the same topic + partition * and cached RocketMQConsumers (see [[CachedRocketMQConsumer]] can be used read data efficiently. * * @param sc the [[SparkContext]] * @param executorRocketMQParams RocketMQ configuration for creating RocketMQConsumer on the executors * @param offsetRanges Offset ranges that define the RocketMQ data belonging to this RDD */ private[rocketmq] class RocketMQSourceRDD( sc: SparkContext, executorRocketMQParams: ju.Map[String, String], offsetRanges: Seq[RocketMQSourceRDDOffsetRange], pollTimeoutMs: Long, failOnDataLoss: Boolean, reuseRocketMQConsumer: Boolean) extends RDD[MessageExt](sc, Nil) { override def persist(newLevel: StorageLevel): this.type = { logError("RocketMQ ConsumerRecord is not serializable. " + "Use .map to extract fields before calling .persist or .window") super.persist(newLevel) } override def getPartitions: Array[Partition] = { offsetRanges.zipWithIndex.map { case (o, i) => RocketMQSourceRDDPartition(i, o) }.toArray } override def count(): Long = offsetRanges.map(_.size).sum override def countApprox(timeout: Long, confidence: Double): PartialResult[BoundedDouble] = { val c = count() new PartialResult(new BoundedDouble(c, 1.0, c, c), true) } override def isEmpty(): Boolean = count == 0L override def take(num: Int): Array[MessageExt] = { val nonEmptyPartitions = this.partitions .map(_.asInstanceOf[RocketMQSourceRDDPartition]) .filter(_.offsetRange.size > 0) if (num < 1 || nonEmptyPartitions.isEmpty) { return new Array[MessageExt](0) } // Determine in advance how many messages need to be taken from each partition val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => val remain = num - result.values.sum if (remain > 0) { val taken = Math.min(remain, part.offsetRange.size) result + (part.index -> taken.toInt) } else { result } } val buf = new ArrayBuffer[MessageExt] val res = context.runJob( this, (tc: TaskContext, it: Iterator[MessageExt]) => it.take(parts(tc.partitionId())).toArray, parts.keys.toArray ) res.foreach(buf ++= _) buf.toArray } override def getPreferredLocations(split: Partition): Seq[String] = { val part = split.asInstanceOf[RocketMQSourceRDDPartition] part.offsetRange.preferredLoc.map(Seq(_)).getOrElse(Seq.empty) } override def compute( thePart: Partition, context: TaskContext): Iterator[MessageExt] = { val sourcePartition = thePart.asInstanceOf[RocketMQSourceRDDPartition] val consumer = if (!reuseRocketMQConsumer) { CachedRocketMQConsumer.getOrCreate(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } else { CachedRocketMQConsumer.createUncached(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } val range = resolveRange(consumer, sourcePartition.offsetRange) assert( range.fromOffset <= range.untilOffset, s"Beginning offset ${range.fromOffset} is after the ending offset ${range.untilOffset} for " + s"${range.messageQueue}. You either provided an invalid fromOffset, or the RocketMQ topic has been damaged") if (range.fromOffset == range.untilOffset) { logInfo(s"Beginning offset ${range.fromOffset} is the same as ending offset, " + s"skipping ${range.messageQueue}") Iterator.empty } else { val underlying = new NextIterator[MessageExt]() { private var requestOffset = range.fromOffset override def getNext(): MessageExt = { if (requestOffset >= range.untilOffset) { // Processed all offsets in this partition. 
finished = true null } else { val r = consumer.get(requestOffset, range.untilOffset, pollTimeoutMs, failOnDataLoss) if (r == null) { // Losing some data. Skip the rest offsets in this partition. finished = true null } else { requestOffset = r.getQueueOffset + 1 // The MessageExt structure does not contains any field of `brokerName`, so put one into properties r.putUserProperty(RocketMQSource.PROP_BROKER_NAME, sourcePartition.offsetRange.messageQueue.getBrokerName) r } } } override protected def close(): Unit = { if (!reuseRocketMQConsumer) { consumer.close() } else { CachedRocketMQConsumer.releaseConsumer(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } } } // Release consumer, either by removing it or indicating we're no longer using it context.addTaskCompletionListener[Unit] { _ => underlying.closeIfNeeded() } underlying } } /** * Resolve the EARLIEST/LATEST placeholder in range * @return the range with actual boundary */ private def resolveRange(consumer: CachedRocketMQConsumer, range: RocketMQSourceRDDOffsetRange) = { if (range.fromOffset < 0 || range.untilOffset < 0) { // Late bind the offset range val availableOffsetRange = consumer.getAvailableOffsetRange() val fromOffset = if (range.fromOffset < 0) { assert(range.fromOffset == RocketMQOffsetRangeLimit.EARLIEST, s"earliest offset ${range.fromOffset} does not equal ${RocketMQOffsetRangeLimit.EARLIEST}") availableOffsetRange.earliest } else { range.fromOffset } val untilOffset = if (range.untilOffset < 0) { assert(range.untilOffset == RocketMQOffsetRangeLimit.LATEST, s"latest offset ${range.untilOffset} does not equal ${RocketMQOffsetRangeLimit.LATEST}") availableOffsetRange.latest } else { range.untilOffset } RocketMQSourceRDDOffsetRange(range.messageQueue, fromOffset, untilOffset, range.preferredLoc) } else { range } } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.2/org/apache/spark/streaming/RocketMqRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.streaming import org.apache.rocketmq.common.message.MessageExt import org.apache.rocketmq.spark._ import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.storage.StorageLevel import org.apache.spark.{Partition, SparkContext, TaskContext} import java.{util => ju} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer /** * A batch-oriented interface for consuming from RocketMq. * Starting and ending offsets are specified in advance, * so that you can control exactly-once semantics. 
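 * The per-JVM consumer cache is tuned via RocketMQConfig.PULL_CONSUMER_CACHE_INIT_CAPACITY,
 * PULL_CONSUMER_CACHE_MAX_CAPACITY and PULL_CONSUMER_CACHE_LOAD_FACTOR, defaulting to 16, 64
 * and 0.75 when not supplied in `optionParams`.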
* @param groupId it is for rocketMq for identifying the consumer * @param optionParams the configs * @param offsetRanges offset ranges that define the RocketMq data belonging to this RDD * @param preferredHosts map from TopicQueueId to preferred host for processing that partition. * In most cases, use [[LocationStrategy.PreferConsistent]] * @param useConsumerCache useConsumerCache whether to use a consumer from a per-jvm cache */ class RocketMqRDD ( sc: SparkContext, val groupId: String, val optionParams: ju.Map[String, String], val offsetRanges: ju.Map[TopicQueueId, Array[OffsetRange]], val preferredHosts: ju.Map[TopicQueueId, String], val useConsumerCache: Boolean )extends RDD[MessageExt](sc, Nil) with HasOffsetRanges{ private val cacheInitialCapacity = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_INIT_CAPACITY, "16").toInt private val cacheMaxCapacity = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_MAX_CAPACITY, "64").toInt private val cacheLoadFactor = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_LOAD_FACTOR, "0.75").toFloat override def persist(newLevel: StorageLevel): this.type = { super.persist(newLevel) } override def getPartitions: Array[Partition] = { offsetRanges.asScala.toArray.zipWithIndex.map{ case ((first, second), i) => new RocketMqRDDPartition(i, first.topic, first.queueId, second) }.toArray } override def count(): Long = offsetRanges.asScala.map(_._2.map(_.count).sum).sum override def countApprox( timeout: Long, confidence: Double = 0.95 ): PartialResult[BoundedDouble] = { val c = count new PartialResult(new BoundedDouble(c, 1.0, c, c), true) } override def isEmpty(): Boolean = count == 0L override def take(num: Int): Array[MessageExt] = { val nonEmptyPartitions = this.partitions .map(_.asInstanceOf[RocketMqRDDPartition]) .filter(_.count > 0) if (num < 1 || nonEmptyPartitions.isEmpty) { return new Array[MessageExt](0) } // Determine in advance how many messages need to be taken from each partition val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => val remain = num - result.values.sum if (remain > 0) { val taken = Math.min(remain, part.count) result + (part.index -> taken.toInt) } else { result } } val buf = new ArrayBuffer[MessageExt] val res = context.runJob( this, (tc: TaskContext, it: Iterator[MessageExt]) => it.take(parts(tc.partitionId)).toArray, parts.keys.toArray ) res.foreach(buf ++= _) buf.toArray } private def executors(): Array[ExecutorCacheTaskLocation] = { val bm = sparkContext.env.blockManager bm.master.getPeers(bm.blockManagerId).toArray .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) .sortWith(compareExecutors) } private def compareExecutors( a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = if (a.host == b.host) { a.executorId > b.executorId } else { a.host > b.host } /** * Non-negative modulus, from java 8 math */ private def floorMod(a: Int, b: Int): Int = ((a % b) + b) % b protected override def getPreferredLocations(thePart: Partition): Seq[String] = { // The intention is best-effort consistent executor for a given topic partition, // so that caching consumers can be effective. 
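// floorMod keeps the index non-negative even for a negative hashCode (e.g. floorMod(-7, 3) == 2),
// so execs(index) below is always a valid lookup.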
val part = thePart.asInstanceOf[RocketMqRDDPartition] val allExecs = executors() val tp = part.topicQueueId() val prefHost = preferredHosts.get(tp) val prefExecs = if (null == prefHost) allExecs else allExecs.filter(_.host == prefHost) val execs = if (prefExecs.isEmpty) allExecs else prefExecs if (execs.isEmpty) { Seq() } else { // execs is sorted, tp.hashCode depends only on topic and partition, so consistent index val index = this.floorMod(tp.hashCode, execs.length) val chosen = execs(index) Seq(chosen.toString) } } private def errBeginAfterEnd(part: RocketMqRDDPartition): String = s"Beginning offset is after the ending offset ${part.partitionOffsetRanges.mkString(",")} " + s"for topic ${part.topic} partition ${part.index}. " + "You either provided an invalid fromOffset, or the Kafka topic has been damaged" override def compute(thePart: Partition, context: TaskContext): Iterator[MessageExt] = { val part = thePart.asInstanceOf[RocketMqRDDPartition] val count = part.count() assert(count >= 0, errBeginAfterEnd(part)) if (count == 0) { logInfo(s"Beginning offset is the same as ending offset " + s"skipping ${part.topic} ${part.queueId}") Iterator.empty } else { new RocketMqRDDIterator(part, context) } } /** * An iterator that fetches messages directly from rocketmq for the offsets in partition. * Uses a cached consumer where possible to take advantage of prefetching */ private class RocketMqRDDIterator( part: RocketMqRDDPartition, context: TaskContext) extends Iterator[MessageExt] { logDebug(s"Computing topic ${part.topic}, queueId ${part.queueId} " + s"offsets ${part.partitionOffsetRanges.mkString(",")}") context.addTaskCompletionListener[Unit]{ context => closeIfNeeded() } val consumer = if (useConsumerCache) { CachedMQConsumer.init(cacheInitialCapacity, cacheMaxCapacity, cacheLoadFactor) if (context.attemptNumber > 5) { // just in case the prior attempt failures were cache related CachedMQConsumer.remove(groupId, part.topic, part.queueId, part.brokerNames) } CachedMQConsumer.getOrCreate(groupId, part.topic, part.queueId, part.brokerNames, optionParams) } else { CachedMQConsumer.getUncached(groupId, part.topic, part.queueId, part.brokerNames, optionParams) } var logicTotalOffset = 0 val totalSum = part.partitionOffsetRanges.map(_.count).sum var index = 0 var requestOffset = part.partitionOffsetRanges.apply(index).fromOffset def closeIfNeeded(): Unit = { if (!useConsumerCache && consumer != null) { consumer.client.shutdown } } override def hasNext(): Boolean = { totalSum > logicTotalOffset } override def next(): MessageExt = { assert(hasNext(), "Can't call getNext() once untilOffset has been reached") val queueRange = part.partitionOffsetRanges.apply(index) val r = consumer.get(queueRange.brokerName, requestOffset) if (queueRange.untilOffset > (requestOffset + 1)) requestOffset +=1 else { index +=1 if (part.partitionOffsetRanges.length > index) requestOffset = part.partitionOffsetRanges.apply(index).fromOffset } logicTotalOffset += 1 r } } private[RocketMqRDD] type OffsetRangeTuple = (String, Int) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.3/org/apache/spark/sql/rocketmq/RocketMQSource.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSource.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ * 2. Schema of output dataframe adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import java.io._ import java.nio.charset.StandardCharsets import java.{util => ju} import org.apache.commons.io.IOUtils import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.rocketmq.RocketMQSource._ import org.apache.spark.sql.types.{StructField, _} import org.apache.spark.unsafe.types.UTF8String /** * A [[Source]] that reads data from RocketMQ using the following design. * * - The [[RocketMQSourceOffset]] is the custom [[Offset]] defined for this source that contains * a map of MessageQueue -> offset. Note that this offset is 1 + (available offset). For * example if the last record in a RocketMQ topic "t", partition 2 is offset 5, then * RocketMQSourceOffset will contain MessageQueue("t", 2) -> 6. This is done keep it consistent * with the semantics of `MQPullConsumer.fetchConsumeOffset()`. * * - The [[RocketMQSource]] written to do the following. * * - As soon as the source is created, the pre-configured [[RocketMQOffsetReader]] * is used to query the initial offsets that this source should * start reading from. This is used to create the first batch. * * - `getOffset()` uses the [[RocketMQOffsetReader]] to query the latest * available offsets, which are returned as a [[RocketMQSourceOffset]]. * * - `getBatch()` returns a DF that reads from the 'start offset' until the 'end offset' in * for each partition. The end offset is excluded to be consistent with the semantics of * [[RocketMQSourceOffset]] and `MQPullConsumer.fetchConsumeOffset()`. * * - The DF returned is based on [[RocketMQSourceRDD]] which is constructed such that the * data from RocketMQ topic + partition is consistently read by the same executors across * batches, and cached RocketMQConsumers in the executors can be reused efficiently. See the * docs on [[RocketMQSourceRDD]] for more details. * * Zero data lost is not guaranteed when topics are deleted. If zero data lost is critical, the user * must make sure all messages in a topic have been processed when deleting a topic. 
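 * Data-loss situations (deleted topics or partitions, offsets aged out on the broker) are funnelled
 * through `reportDataLoss`, which throws an IllegalStateException when the "failOnDataLoss" option
 * is true and only logs a warning otherwise.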
*/ private class RocketMQSource( sqlContext: SQLContext, offsetReader: RocketMQOffsetReader, executorRocketMQParams: ju.Map[String, String], sourceOptions: Map[String, String], metadataPath: String, startingOffsets: RocketMQOffsetRangeLimit, failOnDataLoss: Boolean) extends Source with Logging { private val sc = sqlContext.sparkContext private val pollTimeoutMs = sourceOptions.getOrElse( RocketMQConf.PULL_TIMEOUT_MS, sc.conf.getTimeAsMs("spark.network.timeout", "120s").toString ).toLong private val maxOffsetsPerTrigger = sourceOptions.get("maxOffsetsPerTrigger").map(_.toLong) /** * Lazily initialize `initialPartitionOffsets` to make sure that `RocketMQConsumer.pull` is only * called in StreamExecutionThread. */ private lazy val initialPartitionOffsets = { val metadataLog = new HDFSMetadataLog[RocketMQSourceOffset](sqlContext.sparkSession, metadataPath) { override def serialize(metadata: RocketMQSourceOffset, out: OutputStream): Unit = { out.write(0) // A zero byte is written to support Spark 2.1.0 (SPARK-19517) val writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8)) writer.write("v" + VERSION + "\n") writer.write(metadata.json) writer.flush() } override def deserialize(in: InputStream): RocketMQSourceOffset = { in.read() // A zero byte is read to support Spark 2.1.0 (SPARK-19517) val content = IOUtils.toString(new InputStreamReader(in, StandardCharsets.UTF_8)) // HDFSMetadataLog guarantees that it never creates a partial file. assert(content.length != 0) if (content(0) == 'v') { val indexOfNewLine = content.indexOf("\n") if (indexOfNewLine > 0) { val version = validateVersion(content.substring(0, indexOfNewLine), VERSION) RocketMQSourceOffset(SerializedOffset(content.substring(indexOfNewLine + 1))) } else { throw new IllegalStateException( s"Log file was malformed: failed to detect the log file version line.") } } else { // The log was generated by Spark 2.1.0 RocketMQSourceOffset(SerializedOffset(content)) } } } metadataLog.get(0).getOrElse { val offsets = startingOffsets match { case EarliestOffsetRangeLimit => RocketMQSourceOffset(offsetReader.fetchEarliestOffsets()) case LatestOffsetRangeLimit => RocketMQSourceOffset(offsetReader.fetchLatestOffsets()) case SpecificOffsetRangeLimit(p) => offsetReader.fetchSpecificOffsets(p, reportDataLoss) } metadataLog.add(0, offsets) logInfo(s"Initial offsets: $offsets") offsets }.queueToOffsets } private var currentPartitionOffsets: Option[Map[MessageQueue, Long]] = None override def schema: StructType = RocketMQSource.schema /** Returns the maximum available offset for this source. 
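 * With `maxOffsetsPerTrigger` set, `rateLimit` prorates the limit by backlog; for example
 * (illustrative numbers), a limit of 20 over two queues with backlogs of 30 and 10 advances
 * them by 15 and 5 offsets respectively.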
*/ override def getOffset: Option[Offset] = { // Make sure initialPartitionOffsets is initialized initialPartitionOffsets val latest = offsetReader.fetchLatestOffsets() val offsets = maxOffsetsPerTrigger match { case None => latest case Some(limit) if currentPartitionOffsets.isEmpty => rateLimit(limit, initialPartitionOffsets, latest) case Some(limit) => rateLimit(limit, currentPartitionOffsets.get, latest) } currentPartitionOffsets = Some(offsets) logDebug(s"GetOffset: ${offsets.toSeq.map(_.toString).sorted}") Some(RocketMQSourceOffset(offsets)) } /** Proportionally distribute limit number of offsets among message queues */ private def rateLimit( limit: Long, from: Map[MessageQueue, Long], until: Map[MessageQueue, Long]): Map[MessageQueue, Long] = { val fromNew = offsetReader.fetchEarliestOffsets(until.keySet.diff(from.keySet).toSeq) val sizes = until.flatMap { case (tp, end) => // If begin isn't defined, something's wrong, but let alert logic in getBatch handle it from.get(tp).orElse(fromNew.get(tp)).flatMap { begin => val size = end - begin logDebug(s"rateLimit $tp size is $size") if (size > 0) Some(tp -> size) else None } } val total = sizes.values.sum.toDouble if (total < 1) { until } else { until.map { case (tp, end) => tp -> sizes.get(tp).map { size => val begin = from.get(tp).getOrElse(fromNew(tp)) val prorate = limit * (size / total) logDebug(s"rateLimit $tp prorated amount is $prorate") // Don't completely starve small topicpartitions val off = begin + (if (prorate < 1) Math.ceil(prorate) else Math.floor(prorate)).toLong logDebug(s"rateLimit $tp new offset is $off") // Paranoia, make sure not to return an offset that's past end Math.min(end, off) }.getOrElse(end) } } } /** * Returns the data that is between the offsets * [`start.get.partitionToOffsets`, `end.partitionToOffsets`), i.e. end.partitionToOffsets is * exclusive. */ override def getBatch(start: Option[Offset], end: Offset): DataFrame = { // Make sure initialPartitionOffsets is initialized initialPartitionOffsets logInfo(s"GetBatch called with start = $start, end = $end") val untilPartitionOffsets = RocketMQSourceOffset.getPartitionOffsets(end) // On recovery, getBatch will get called before getOffset if (currentPartitionOffsets.isEmpty) { currentPartitionOffsets = Some(untilPartitionOffsets) } if (start.isDefined && start.get == end) { return sqlContext.internalCreateDataFrame( sqlContext.sparkContext.emptyRDD, schema, isStreaming = true) } val fromPartitionOffsets = start match { case Some(prevBatchEndOffset) => RocketMQSourceOffset.getPartitionOffsets(prevBatchEndOffset) case None => initialPartitionOffsets } // Find the new partitions, and get their earliest offsets val newPartitions = untilPartitionOffsets.keySet.diff(fromPartitionOffsets.keySet) val newPartitionOffsets = offsetReader.fetchEarliestOffsets(newPartitions.toSeq) if (newPartitionOffsets.keySet != newPartitions) { // We cannot get from offsets for some partitions. It means they got deleted. val deletedPartitions = newPartitions.diff(newPartitionOffsets.keySet) reportDataLoss( s"Cannot find earliest offsets of $deletedPartitions. Some data may have been missed") } logInfo(s"Partitions added: $newPartitionOffsets") newPartitionOffsets.filter(_._2 != 0).foreach { case (p, o) => reportDataLoss( s"Added partition $p starts from $o instead of 0. Some data may have been missed") } val deletedPartitions = fromPartitionOffsets.keySet.diff(untilPartitionOffsets.keySet) if (deletedPartitions.nonEmpty) { reportDataLoss(s"$deletedPartitions are gone. 
Some data may have been missed") } // Use the until partitions to calculate offset ranges to ignore partitions that have // been deleted val topicPartitions = untilPartitionOffsets.keySet.filter { tp => // Ignore partitions that we don't know the from offsets. newPartitionOffsets.contains(tp) || fromPartitionOffsets.contains(tp) }.toSeq logDebug("TopicPartitions: " + topicPartitions.mkString(", ")) val sortedExecutors = getSortedExecutorList(sc) val numExecutors = sortedExecutors.length logDebug("Sorted executors: " + sortedExecutors.mkString(", ")) // Calculate offset ranges val offsetRanges = topicPartitions.map { tp => val fromOffset = fromPartitionOffsets.getOrElse(tp, { newPartitionOffsets.getOrElse(tp, { // This should not happen since newPartitionOffsets contains all partitions not in // fromPartitionOffsets throw new IllegalStateException(s"$tp doesn't have a from offset") }) }) val untilOffset = untilPartitionOffsets(tp) val preferredLoc = if (numExecutors > 0) { // This allows cached RocketMQConsumers in the executors to be re-used to read the same // partition in every batch. Some(sortedExecutors(Math.floorMod(tp.hashCode, numExecutors))) } else None RocketMQSourceRDDOffsetRange(tp, fromOffset, untilOffset, preferredLoc) }.filter { range => if (range.untilOffset < range.fromOffset) { reportDataLoss(s"Partition ${range.messageQueue}'s offset was changed from " + s"${range.fromOffset} to ${range.untilOffset}, some data may have been missed") false } else { true } }.toArray // Create an RDD that reads from RocketMQ and get the (key, value) pair as byte arrays. val rdd = new RocketMQSourceRDD( sc, executorRocketMQParams, offsetRanges, pollTimeoutMs, failOnDataLoss, reuseRocketMQConsumer = true).map { cr => // Remove the `brokerName` property which was added by us. See `RocketMQSourceRDD.compute` val brokerName = cr.getProperties.remove(RocketMQSource.PROP_BROKER_NAME) InternalRow( UTF8String.fromString(cr.getTopic), // topic cr.getFlag, // flag cr.getBody, // body UTF8String.fromString(JsonUtils.messageProperties(cr.getProperties)), // properties UTF8String.fromString(brokerName), // brokerName cr.getQueueId, // queueId cr.getQueueOffset, // queueOffset DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.getBornTimestamp)), // bornTimestamp DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.getStoreTimestamp)) // storeTimestamp ) } logInfo("GetBatch generating RDD of offset range: " + offsetRanges.sortBy(_.messageQueue.toString).mkString(", ")) sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true) } /** Stop this source and free any resources it has allocated. */ override def stop(): Unit = synchronized { offsetReader.close() } override def toString: String = s"RocketMQSource[$offsetReader]" /** * If `failOnDataLoss` is true, this method will throw an `IllegalStateException`. * Otherwise, just log a warning. */ private def reportDataLoss(message: String): Unit = { if (failOnDataLoss) { throw new IllegalStateException(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE") } else { logWarning(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE") } } } /** Companion object for the [[RocketMQSource]]. */ private object RocketMQSource { val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE = """ |Some data may have been lost because they are not available in RocketMQ any more; either the | data was aged out by RocketMQ or the topic may have been deleted before all the data in the | topic was processed. 
If you want your streaming query to fail on such cases, set the source | option "failOnDataLoss" to "true". """.stripMargin val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE = """ |Some data may have been lost because they are not available in RocketMQ any more; either the | data was aged out by RocketMQ or the topic may have been deleted before all the data in the | topic was processed. If you don't want your streaming query to fail on such cases, set the | source option "failOnDataLoss" to "false". """.stripMargin val VERSION = 1 val PROP_BROKER_NAME = "_brokerName" def getSortedExecutorList(sc: SparkContext): Array[String] = { val bm = sc.env.blockManager bm.master.getPeers(bm.blockManagerId).toArray .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) .sortWith(compare) .map(_.toString) } private def compare(a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = { if (a.host == b.host) { a.executorId > b.executorId } else { a.host > b.host } } def schema: StructType = StructType(Seq( // fields of `Message` StructField("topic", StringType), StructField("flag", IntegerType), StructField("body", BinaryType), StructField("properties", StringType), // fields of `MessageExt` StructField("brokerName", StringType), StructField("queueId", IntegerType), StructField("queueOffset", LongType), StructField("bornTimestamp", TimestampType), StructField("storeTimestamp", TimestampType) )) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.3/org/apache/spark/sql/rocketmq/RocketMQSourceOffset.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSourceOffset.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import org.apache.rocketmq.common.message.MessageQueue import org.apache.spark.sql.connector.read.streaming.PartitionOffset import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} /** * An [[Offset]] for the [[RocketMQSource]]. This one tracks all partitions of subscribed topics and * their offsets. 
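*
* For illustration, a minimal construction sketch using the tuple-based factory in the
* companion object below (the topic and broker names here are hypothetical):
* {{{
*   // one (topic, brokerName, queueId, offset) tuple per message queue
*   val offset = RocketMQSourceOffset(
*     ("orders", "broker-a", 0, 100L),
*     ("orders", "broker-a", 1, 250L))
* }}}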
*/ private[rocketmq] case class RocketMQSourceOffset(queueToOffsets: Map[MessageQueue, Long]) extends Offset { override val json = JsonUtils.partitionOffsets(queueToOffsets) } private[rocketmq] case class RocketMQSourcePartitionOffset(messageQueue: MessageQueue, queueOffset: Long) extends PartitionOffset /** Companion object of the [[RocketMQSourceOffset]] */ private[rocketmq] object RocketMQSourceOffset { def getPartitionOffsets(offset: Offset): Map[MessageQueue, Long] = { offset match { case o: RocketMQSourceOffset => o.queueToOffsets case so: SerializedOffset => RocketMQSourceOffset(so).queueToOffsets case _ => throw new IllegalArgumentException( s"Invalid conversion from offset of ${offset.getClass} to RocketMQSourceOffset") } } /** * Returns [[RocketMQSourceOffset]] from a variable sequence of (topic, brokerName, queueId, offset) * tuples. */ def apply(offsetTuples: (String, String, Int, Long)*): RocketMQSourceOffset = { RocketMQSourceOffset(offsetTuples.map { case(t, b, q, o) => (new MessageQueue(t, b, q), o) }.toMap) } /** * Returns [[RocketMQSourceOffset]] from a JSON [[SerializedOffset]] */ def apply(offset: SerializedOffset): RocketMQSourceOffset = RocketMQSourceOffset(JsonUtils.partitionOffsets(offset.json)) } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.3/org/apache/spark/sql/rocketmq/RocketMQSourceRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * This file was taken from Apache Spark org/apache/spark/sql/kafka010/KafkaSourceRDD.scala * * There are some modifications: * 1. Parameters and API were adapted to RocketMQ */ package org.apache.spark.sql.rocketmq import org.apache.rocketmq.common.message.{MessageExt, MessageQueue} import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.util.NextIterator import org.apache.spark.{Partition, SparkContext, TaskContext} import java.{util => ju} import scala.collection.mutable.ArrayBuffer /** Offset range that one partition of the RocketMQSourceRDD has to read */ private[rocketmq] case class RocketMQSourceRDDOffsetRange( messageQueue: MessageQueue, fromOffset: Long, untilOffset: Long, preferredLoc: Option[String]) { def size: Long = untilOffset - fromOffset } /** Partition of the RocketMQSourceRDD */ private[rocketmq] case class RocketMQSourceRDDPartition(index: Int, offsetRange: RocketMQSourceRDDOffsetRange) extends Partition /** * An RDD that reads data from RocketMQ based on offset ranges across multiple partitions. 
* Additionally, it allows preferred locations to be set for each topic + partition, so that * the [[RocketMQSource]] can ensure the same executor always reads the same topic + partition * and cached RocketMQConsumers (see [[CachedRocketMQConsumer]] can be used read data efficiently. * * @param sc the [[SparkContext]] * @param executorRocketMQParams RocketMQ configuration for creating RocketMQConsumer on the executors * @param offsetRanges Offset ranges that define the RocketMQ data belonging to this RDD */ private[rocketmq] class RocketMQSourceRDD( sc: SparkContext, executorRocketMQParams: ju.Map[String, String], offsetRanges: Seq[RocketMQSourceRDDOffsetRange], pollTimeoutMs: Long, failOnDataLoss: Boolean, reuseRocketMQConsumer: Boolean) extends RDD[MessageExt](sc, Nil) { override def persist(newLevel: StorageLevel): this.type = { logError("RocketMQ ConsumerRecord is not serializable. " + "Use .map to extract fields before calling .persist or .window") super.persist(newLevel) } override def getPartitions: Array[Partition] = { offsetRanges.zipWithIndex.map { case (o, i) => RocketMQSourceRDDPartition(i, o) }.toArray } override def count(): Long = offsetRanges.map(_.size).sum override def countApprox(timeout: Long, confidence: Double): PartialResult[BoundedDouble] = { val c = count() new PartialResult(new BoundedDouble(c, 1.0, c, c), true) } override def isEmpty(): Boolean = count == 0L override def take(num: Int): Array[MessageExt] = { val nonEmptyPartitions = this.partitions .map(_.asInstanceOf[RocketMQSourceRDDPartition]) .filter(_.offsetRange.size > 0) if (num < 1 || nonEmptyPartitions.isEmpty) { return new Array[MessageExt](0) } // Determine in advance how many messages need to be taken from each partition val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => val remain = num - result.values.sum if (remain > 0) { val taken = Math.min(remain, part.offsetRange.size) result + (part.index -> taken.toInt) } else { result } } val buf = new ArrayBuffer[MessageExt] val res = context.runJob( this, (tc: TaskContext, it: Iterator[MessageExt]) => it.take(parts(tc.partitionId())).toArray, parts.keys.toArray ) res.foreach(buf ++= _) buf.toArray } override def getPreferredLocations(split: Partition): Seq[String] = { val part = split.asInstanceOf[RocketMQSourceRDDPartition] part.offsetRange.preferredLoc.map(Seq(_)).getOrElse(Seq.empty) } override def compute( thePart: Partition, context: TaskContext): Iterator[MessageExt] = { val sourcePartition = thePart.asInstanceOf[RocketMQSourceRDDPartition] val consumer = if (!reuseRocketMQConsumer) { CachedRocketMQConsumer.getOrCreate(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } else { CachedRocketMQConsumer.createUncached(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } val range = resolveRange(consumer, sourcePartition.offsetRange) assert( range.fromOffset <= range.untilOffset, s"Beginning offset ${range.fromOffset} is after the ending offset ${range.untilOffset} for " + s"${range.messageQueue}. You either provided an invalid fromOffset, or the RocketMQ topic has been damaged") if (range.fromOffset == range.untilOffset) { logInfo(s"Beginning offset ${range.fromOffset} is the same as ending offset, " + s"skipping ${range.messageQueue}") Iterator.empty } else { val underlying = new NextIterator[MessageExt]() { private var requestOffset = range.fromOffset override def getNext(): MessageExt = { if (requestOffset >= range.untilOffset) { // Processed all offsets in this partition. 
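// `NextIterator` treats `finished = true` together with a null return value as end-of-data
// for this partition; the same signal is used below when `consumer.get` returns null after
// a data-loss gap and the remaining offsets are skipped.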
finished = true null } else { val r = consumer.get(requestOffset, range.untilOffset, pollTimeoutMs, failOnDataLoss) if (r == null) { // Losing some data. Skip the rest offsets in this partition. finished = true null } else { requestOffset = r.getQueueOffset + 1 // The MessageExt structure does not contains any field of `brokerName`, so put one into properties r.putUserProperty(RocketMQSource.PROP_BROKER_NAME, sourcePartition.offsetRange.messageQueue.getBrokerName) r } } } override protected def close(): Unit = { if (!reuseRocketMQConsumer) { consumer.close() } else { CachedRocketMQConsumer.releaseConsumer(sourcePartition.offsetRange.messageQueue, executorRocketMQParams) } } } // Release consumer, either by removing it or indicating we're no longer using it context.addTaskCompletionListener[Unit] { _ => underlying.closeIfNeeded() } underlying } } /** * Resolve the EARLIEST/LATEST placeholder in range * @return the range with actual boundary */ private def resolveRange(consumer: CachedRocketMQConsumer, range: RocketMQSourceRDDOffsetRange) = { if (range.fromOffset < 0 || range.untilOffset < 0) { // Late bind the offset range val availableOffsetRange = consumer.getAvailableOffsetRange() val fromOffset = if (range.fromOffset < 0) { assert(range.fromOffset == RocketMQOffsetRangeLimit.EARLIEST, s"earliest offset ${range.fromOffset} does not equal ${RocketMQOffsetRangeLimit.EARLIEST}") availableOffsetRange.earliest } else { range.fromOffset } val untilOffset = if (range.untilOffset < 0) { assert(range.untilOffset == RocketMQOffsetRangeLimit.LATEST, s"latest offset ${range.untilOffset} does not equal ${RocketMQOffsetRangeLimit.LATEST}") availableOffsetRange.latest } else { range.untilOffset } RocketMQSourceRDDOffsetRange(range.messageQueue, fromOffset, untilOffset, range.preferredLoc) } else { range } } } ================================================ FILE: fire-connectors/spark-connectors/spark-rocketmq/src/main/scala-spark-3.3/org/apache/spark/streaming/RocketMqRDD.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.streaming import org.apache.rocketmq.common.message.MessageExt import org.apache.rocketmq.spark._ import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.storage.StorageLevel import org.apache.spark.{Partition, SparkContext, TaskContext} import java.{util => ju} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer /** * A batch-oriented interface for consuming from RocketMq. * Starting and ending offsets are specified in advance, * so that you can control exactly-once semantics. 
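* Each entry of `offsetRanges` (one `TopicQueueId` together with its `OffsetRange`s) becomes a
* single RDD partition; see `getPartitions` and `compute` below.
*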
* @param groupId the RocketMq consumer group id used to identify this consumer * @param optionParams the configs * @param offsetRanges offset ranges that define the RocketMq data belonging to this RDD * @param preferredHosts map from TopicQueueId to preferred host for processing that partition. * In most cases, use [[LocationStrategy.PreferConsistent]] * @param useConsumerCache whether to use a consumer from a per-JVM cache */ class RocketMqRDD ( sc: SparkContext, val groupId: String, val optionParams: ju.Map[String, String], val offsetRanges: ju.Map[TopicQueueId, Array[OffsetRange]], val preferredHosts: ju.Map[TopicQueueId, String], val useConsumerCache: Boolean ) extends RDD[MessageExt](sc, Nil) with HasOffsetRanges { private val cacheInitialCapacity = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_INIT_CAPACITY, "16").toInt private val cacheMaxCapacity = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_MAX_CAPACITY, "64").toInt private val cacheLoadFactor = optionParams.getOrDefault(RocketMQConfig.PULL_CONSUMER_CACHE_LOAD_FACTOR, "0.75").toFloat override def persist(newLevel: StorageLevel): this.type = { super.persist(newLevel) } override def getPartitions: Array[Partition] = { offsetRanges.asScala.toArray.zipWithIndex.map{ case ((first, second), i) => new RocketMqRDDPartition(i, first.topic, first.queueId, second) }.toArray } override def count(): Long = offsetRanges.asScala.map(_._2.map(_.count).sum).sum override def countApprox( timeout: Long, confidence: Double = 0.95 ): PartialResult[BoundedDouble] = { val c = count new PartialResult(new BoundedDouble(c, 1.0, c, c), true) } override def isEmpty(): Boolean = count == 0L override def take(num: Int): Array[MessageExt] = { val nonEmptyPartitions = this.partitions .map(_.asInstanceOf[RocketMqRDDPartition]) .filter(_.count > 0) if (num < 1 || nonEmptyPartitions.isEmpty) { return new Array[MessageExt](0) } // Determine in advance how many messages need to be taken from each partition val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => val remain = num - result.values.sum if (remain > 0) { val taken = Math.min(remain, part.count) result + (part.index -> taken.toInt) } else { result } } val buf = new ArrayBuffer[MessageExt] val res = context.runJob( this, (tc: TaskContext, it: Iterator[MessageExt]) => it.take(parts(tc.partitionId)).toArray, parts.keys.toArray ) res.foreach(buf ++= _) buf.toArray } private def executors(): Array[ExecutorCacheTaskLocation] = { val bm = sparkContext.env.blockManager bm.master.getPeers(bm.blockManagerId).toArray .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) .sortWith(compareExecutors) } private def compareExecutors( a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = if (a.host == b.host) { a.executorId > b.executorId } else { a.host > b.host } /** * Non-negative modulus, from java 8 math */ private def floorMod(a: Int, b: Int): Int = ((a % b) + b) % b protected override def getPreferredLocations(thePart: Partition): Seq[String] = { // The intention is best-effort consistent executor for a given topic partition, // so that caching consumers can be effective.
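// The selection below restricts candidates to executors on the preferred host for this queue
// (when one is configured), falls back to all executors otherwise, and then picks
// floorMod(tp.hashCode, execs.length) so a given queue keeps mapping to the same executor.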
val part = thePart.asInstanceOf[RocketMqRDDPartition] val allExecs = executors() val tp = part.topicQueueId() val prefHost = preferredHosts.get(tp) val prefExecs = if (null == prefHost) allExecs else allExecs.filter(_.host == prefHost) val execs = if (prefExecs.isEmpty) allExecs else prefExecs if (execs.isEmpty) { Seq() } else { // execs is sorted, tp.hashCode depends only on topic and partition, so consistent index val index = this.floorMod(tp.hashCode, execs.length) val chosen = execs(index) Seq(chosen.toString) } } private def errBeginAfterEnd(part: RocketMqRDDPartition): String = s"Beginning offset is after the ending offset ${part.partitionOffsetRanges.mkString(",")} " + s"for topic ${part.topic} partition ${part.index}. " + "You either provided an invalid fromOffset, or the RocketMq topic has been damaged" override def compute(thePart: Partition, context: TaskContext): Iterator[MessageExt] = { val part = thePart.asInstanceOf[RocketMqRDDPartition] val count = part.count() assert(count >= 0, errBeginAfterEnd(part)) if (count == 0) { logInfo(s"Beginning offset is the same as ending offset, " + s"skipping ${part.topic} ${part.queueId}") Iterator.empty } else { new RocketMqRDDIterator(part, context) } } /** * An iterator that fetches messages directly from RocketMq for the offsets in a partition. * Uses a cached consumer where possible to take advantage of prefetching */ private class RocketMqRDDIterator( part: RocketMqRDDPartition, context: TaskContext) extends Iterator[MessageExt] { logDebug(s"Computing topic ${part.topic}, queueId ${part.queueId} " + s"offsets ${part.partitionOffsetRanges.mkString(",")}") context.addTaskCompletionListener[Unit] { _ => closeIfNeeded() } val consumer = if (useConsumerCache) { CachedMQConsumer.init(cacheInitialCapacity, cacheMaxCapacity, cacheLoadFactor) if (context.attemptNumber > 5) { // just in case the prior attempt failures were cache related CachedMQConsumer.remove(groupId, part.topic, part.queueId, part.brokerNames) } CachedMQConsumer.getOrCreate(groupId, part.topic, part.queueId, part.brokerNames, optionParams) } else { CachedMQConsumer.getUncached(groupId, part.topic, part.queueId, part.brokerNames, optionParams) } var logicTotalOffset = 0 val totalSum = part.partitionOffsetRanges.map(_.count).sum var index = 0 var requestOffset = part.partitionOffsetRanges.apply(index).fromOffset def closeIfNeeded(): Unit = { if (!useConsumerCache && consumer != null) { consumer.client.shutdown } } override def hasNext(): Boolean = { totalSum > logicTotalOffset } override def next(): MessageExt = { assert(hasNext(), "Can't call next() once untilOffset has been reached") val queueRange = part.partitionOffsetRanges.apply(index) val r = consumer.get(queueRange.brokerName, requestOffset) if (queueRange.untilOffset > (requestOffset + 1)) requestOffset += 1 else { index += 1 if (part.partitionOffsetRanges.length > index) requestOffset = part.partitionOffsetRanges.apply(index).fromOffset } logicTotalOffset += 1 r } } private[RocketMqRDD] type OffsetRangeTuple = (String, Int) } ================================================ FILE: fire-core/pom.xml ================================================ 4.0.0 fire-core_${scala.binary.version} jar Fire : Core com.zto.fire fire-parent 2.3.2-SNAPSHOT ../pom.xml com.zto.fire fire-common_${scala.binary.version} ${fire.version} ${maven.scope} com.zto.fire fire-enhance-arthas_${scala.binary.version} ${fire.version} ${maven.scope} com.taobao.arthas arthas-agent-attach ${arthas.version} ${maven.scope} com.taobao.arthas
arthas-packaging ${arthas.version} ${maven.scope} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/TimeCost.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core; import com.zto.fire.common.util.DateFormatUtils; import com.zto.fire.common.util.ExceptionBus; import com.zto.fire.common.util.OSUtils; import org.apache.htrace.fasterxml.jackson.annotation.JsonIgnore; import java.io.Serializable; import java.util.UUID; /** * 用于记录任务的执行时间 * * @author ChengLong 2019-6-10 16:16:16 */ public class TimeCost implements Serializable { // 异常信息 private String msg; // 耗时 private Long timeCost; private String ip; private String load; // 多核cpu使用率 private String cpuUsage; // 用于区分埋点日志和用户日志 private boolean isFire = false; private String id = UUID.randomUUID().toString(); // 任务的applicationId private static String applicationId; // 任务的main方法 private static String mainClass; // executorId private static String executorId; private Integer stageId; private Long taskId; private Integer partitionId; @JsonIgnore private Throwable exception; private String stackTraceInfo; private String level = "WARN"; private String module; private Integer io; private Long start; private String startTime; private String endTime; public String getId() { return id; } public String getLoad() { return load; } public String getMsg() { return msg; } public Long getTimeCost() { if (this.timeCost == null) { return System.currentTimeMillis() - this.start; } return timeCost; } public String getStartTime() { return startTime; } public void setStartTime(String startTime) { this.startTime = startTime; } public String getEndTime() { return endTime; } public void setEndTime(String endTime) { this.endTime = endTime; } public String getIp() { return ip; } public Integer getStageId() { return stageId; } public Long getTaskId() { return taskId; } public Integer getPartitionId() { return partitionId; } public Boolean getIsFire() { return isFire; } public static String getApplicationId() { return applicationId; } public static void setApplicationId(String applicationId) { TimeCost.applicationId = applicationId; } public static String getExecutorId() { return executorId; } public static String getMainClass() { return mainClass; } public static void setExecutorId(String executorId) { TimeCost.executorId = executorId; } public static void setMainClass(String mainClass) { TimeCost.mainClass = mainClass; } public void setMsg(String msg) { this.msg = msg; } public void setTimeCost(Long timeCost) { this.timeCost = timeCost; } public Boolean getFire() { return isFire; } public void setFire(Boolean fire) { isFire = fire; } public void setIp(String ip) { 
this.ip = ip; } public void setLoad(String load) { this.load = load; } public void setStageId(Integer stageId) { this.stageId = stageId; } public void setTaskId(Long taskId) { this.taskId = taskId; } public void setPartitionId(Integer partitionId) { this.partitionId = partitionId; } public Long getStart() { return start; } public void setStart(Long start) { this.start = start; } public String getStackTraceInfo() { return stackTraceInfo; } public void setStackTraceInfo(String stackTraceInfo) { this.stackTraceInfo = stackTraceInfo; } public String getModule() { return module; } public Integer getIo() { return io; } public String getLevel() { return level; } public void setLevel(String level) { this.level = level; } public String getCpuUsage() { return cpuUsage; } public void setCpuUsage(String cpuUsage) { this.cpuUsage = cpuUsage; } private String lable() { if (this.isFire) { return "fire"; } else { return "user"; } } @Override public String toString() { String baseInfo = "【" + this.lable() + "Log】 〖" + this.msg + "〗 start:" + this.startTime + " end:" + this.endTime + " cost:" + this.getTimeCost() + " ip:" + this.ip + " load:" + this.load + " cpuUsage:" + this.cpuUsage + " executor:" + this.executorId; if (!"driver".equalsIgnoreCase(this.executorId)) { baseInfo += " stage:" + this.stageId + " task:" + this.taskId; } if (this.isFire) { baseInfo += " module:" + this.module + " io:" + this.io; } return baseInfo; } private TimeCost() { this.start = System.currentTimeMillis(); this.startTime = DateFormatUtils.formatCurrentDateTime(); this.ip = OSUtils.getIp(); } /** * 构建一个TimCost对象 * * @return 返回TimeCost对象实例 */ public static TimeCost build() { return new TimeCost(); } /** * 设置必要的参数 * * @return 当前对象 */ public TimeCost info(String msg, String module, Integer io, Boolean isFire, Throwable exception) { this.timeCost = System.currentTimeMillis() - this.start; this.endTime = DateFormatUtils.formatCurrentDateTime(); this.exception = exception; this.msg = msg; this.module = module; this.io = io; if (isFire != null) this.isFire = isFire; if (exception != null) { this.stackTraceInfo = ExceptionBus.stackTrace(exception); this.level = "ERROR"; } return this; } } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/HBase.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Hbase connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-04-26 13:36:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface HBase { /** * HBase集群连接信息:hbase.cluster */ String value() default ""; /** * HBase集群连接信息:hbase.cluster,同value */ String cluster() default ""; /** * 列族名称:hbase.column.family */ String family() default ""; /** * 每个线程最多insert的记录数:fire.hbase.batch.size */ int batchSize() default -1; /** * spark引擎:scan hbase后存放到rdd的多少个partition中:fire.hbase.scan.partitions */ int scanPartitions() default -1; /** * spark引擎:scan后的缓存级别:fire.hbase.storage.level */ String storageLevel() default ""; /** * flink引擎:sink hbase失败最大重试次数:hbase.max.retry */ int maxRetries() default -1; /** * WAL等级:hbase.durability */ String durability() default ""; /** * 是否启用表信息缓存,提高表是否存在判断的效率:fire.hbase.table.exists.cache.enable */ boolean tableMetaCache() default true; /** * hbase-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/HBase2.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Hbase connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-04-26 13:36:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface HBase2 { /** * HBase集群连接信息:hbase.cluster */ String value() default ""; /** * HBase集群连接信息:hbase.cluster,同value */ String cluster() default ""; /** * 列族名称:hbase.column.family */ String family() default ""; /** * 每个线程最多insert的记录数:fire.hbase.batch.size */ int batchSize() default -1; /** * spark引擎:scan hbase后存放到rdd的多少个partition中:fire.hbase.scan.partitions */ int scanPartitions() default -1; /** * spark引擎:scan后的缓存级别:fire.hbase.storage.level */ String storageLevel() default ""; /** * flink引擎:sink hbase失败最大重试次数:hbase.max.retry */ int maxRetries() default -1; /** * WAL等级:hbase.durability */ String durability() default ""; /** * 是否启用表信息缓存,提高表是否存在判断的效率:fire.hbase.table.exists.cache.enable */ boolean tableMetaCache() default true; /** * hbase-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/HBase3.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Hbase connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-04-26 13:36:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface HBase3 { /** * HBase集群连接信息:hbase.cluster */ String value() default ""; /** * HBase集群连接信息:hbase.cluster,同value */ String cluster() default ""; /** * 列族名称:hbase.column.family */ String family() default ""; /** * 每个线程最多insert的记录数:fire.hbase.batch.size */ int batchSize() default -1; /** * spark引擎:scan hbase后存放到rdd的多少个partition中:fire.hbase.scan.partitions */ int scanPartitions() default -1; /** * spark引擎:scan后的缓存级别:fire.hbase.storage.level */ String storageLevel() default ""; /** * flink引擎:sink hbase失败最大重试次数:hbase.max.retry */ int maxRetries() default -1; /** * WAL等级:hbase.durability */ String durability() default ""; /** * 是否启用表信息缓存,提高表是否存在判断的效率:fire.hbase.table.exists.cache.enable */ boolean tableMetaCache() default true; /** * hbase-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/HBase4.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Hbase connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-06-16 14:36:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface HBase4 { /** * HBase集群连接信息:hbase.cluster */ String value() default ""; /** * HBase集群连接信息:hbase.cluster,同value */ String cluster() default ""; /** * 列族名称:hbase.column.family */ String family() default ""; /** * 每个线程最多insert的记录数:fire.hbase.batch.size */ int batchSize() default -1; /** * spark引擎:scan hbase后存放到rdd的多少个partition中:fire.hbase.scan.partitions */ int scanPartitions() default -1; /** * spark引擎:scan后的缓存级别:fire.hbase.storage.level */ String storageLevel() default ""; /** * flink引擎:sink hbase失败最大重试次数:hbase.max.retry */ int maxRetries() default -1; /** * WAL等级:hbase.durability */ String durability() default ""; /** * 是否启用表信息缓存,提高表是否存在判断的效率:fire.hbase.table.exists.cache.enable */ boolean tableMetaCache() default true; /** * hbase-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/HBase5.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Hbase connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-06-16 14:36:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface HBase5 { /** * HBase集群连接信息:hbase.cluster */ String value() default ""; /** * HBase集群连接信息:hbase.cluster,同value */ String cluster() default ""; /** * 列族名称:hbase.column.family */ String family() default ""; /** * 每个线程最多insert的记录数:fire.hbase.batch.size */ int batchSize() default -1; /** * spark引擎:scan hbase后存放到rdd的多少个partition中:fire.hbase.scan.partitions */ int scanPartitions() default -1; /** * spark引擎:scan后的缓存级别:fire.hbase.storage.level */ String storageLevel() default ""; /** * flink引擎:sink hbase失败最大重试次数:hbase.max.retry */ int maxRetries() default -1; /** * WAL等级:hbase.durability */ String durability() default ""; /** * 是否启用表信息缓存,提高表是否存在判断的效率:fire.hbase.table.exists.cache.enable */ boolean tableMetaCache() default true; /** * hbase-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/Hive.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Hive connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong * @Date 2022-04-26 13:46:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Hive { /** * hive连接别名:hive.cluster */ String value() default ""; /** * hive连接别名:hive.cluster,同value */ String cluster() default ""; /** * hive的版本:hive.version */ String version() default ""; /** * 在flink中hive的catalog名称:hive.catalog.name */ String catalog() default ""; /** * 分区名称(dt、ds):default.table.partition.name */ String partition() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/Jdbc.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Jdbc connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-04-26 13:56:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Jdbc { /** * Jdbc的url,同value */ String url(); /** * jdbc 驱动类,不填可根据url自动推断 */ String driver() default ""; /** * jdbc的用户名 */ String username(); /** * jdbc的密码 */ String password() default ""; /** * 事务的隔离级别 */ String isolationLevel() default ""; /** * 连接池的最大连接数 */ int maxPoolSize() default -1; /** * 连接池最少连接数 */ int minPoolSize() default -1; /** * 连接池初始连接数 */ int initialPoolSize() default -1; /** * 连接池的增量 */ int acquireIncrement() default -1; /** * 连接的最大空闲时间 */ int maxIdleTime() default -1; /** * 多少条操作一次 */ int batchSize() default -1; /** * flink引擎:flush的间隔周期(ms) */ long flushInterval() default -1; /** * flink引擎:失败最大重试次数 */ int maxRetries() default -1; /** * spark引擎:scan后的缓存级别:fire.jdbc.storage.level */ String storageLevel() default ""; /** * spark引擎:select后存放到rdd的多少个partition中:fire.jdbc.query.partitions */ int queryPartitions() default -1; /** * 日志中打印的sql长度 */ int logSqlLength() default -1; /** * c3p0参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/Jdbc2.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Jdbc connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-04-26 13:56:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Jdbc2 { /** * Jdbc的url,同value */ String url(); /** * jdbc 驱动类,不填可根据url自动推断 */ String driver() default ""; /** * jdbc的用户名 */ String username(); /** * jdbc的密码 */ String password() default ""; /** * 事务的隔离级别 */ String isolationLevel() default ""; /** * 连接池的最大连接数 */ int maxPoolSize() default -1; /** * 连接池最少连接数 */ int minPoolSize() default -1; /** * 连接池初始连接数 */ int initialPoolSize() default -1; /** * 连接池的增量 */ int acquireIncrement() default -1; /** * 连接的最大空闲时间 */ int maxIdleTime() default -1; /** * 多少条操作一次 */ int batchSize() default -1; /** * flink引擎:flush的间隔周期(ms) */ long flushInterval() default -1; /** * flink引擎:失败最大重试次数 */ int maxRetries() default -1; /** * spark引擎:scan后的缓存级别:fire.jdbc.storage.level */ String storageLevel() default ""; /** * spark引擎:select后存放到rdd的多少个partition中:fire.jdbc.query.partitions */ int queryPartitions() default -1; /** * 日志中打印的sql长度 */ int logSqlLength() default -1; /** * c3p0参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/Jdbc3.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Jdbc connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-04-26 13:56:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Jdbc3 { /** * Jdbc的url,同value */ String url(); /** * jdbc 驱动类,不填可根据url自动推断 */ String driver() default ""; /** * jdbc的用户名 */ String username(); /** * jdbc的密码 */ String password() default ""; /** * 事务的隔离级别 */ String isolationLevel() default ""; /** * 连接池的最大连接数 */ int maxPoolSize() default -1; /** * 连接池最少连接数 */ int minPoolSize() default -1; /** * 连接池初始连接数 */ int initialPoolSize() default -1; /** * 连接池的增量 */ int acquireIncrement() default -1; /** * 连接的最大空闲时间 */ int maxIdleTime() default -1; /** * 多少条操作一次 */ int batchSize() default -1; /** * flink引擎:flush的间隔周期(ms) */ long flushInterval() default -1; /** * flink引擎:失败最大重试次数 */ int maxRetries() default -1; /** * spark引擎:scan后的缓存级别:fire.jdbc.storage.level */ String storageLevel() default ""; /** * spark引擎:select后存放到rdd的多少个partition中:fire.jdbc.query.partitions */ int queryPartitions() default -1; /** * 日志中打印的sql长度 */ int logSqlLength() default -1; /** * c3p0参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/Jdbc4.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Jdbc connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-06-16 14:56:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Jdbc4 { /** * Jdbc的url,同value */ String url(); /** * jdbc 驱动类,不填可根据url自动推断 */ String driver() default ""; /** * jdbc的用户名 */ String username(); /** * jdbc的密码 */ String password() default ""; /** * 事务的隔离级别 */ String isolationLevel() default ""; /** * 连接池的最大连接数 */ int maxPoolSize() default -1; /** * 连接池最少连接数 */ int minPoolSize() default -1; /** * 连接池初始连接数 */ int initialPoolSize() default -1; /** * 连接池的增量 */ int acquireIncrement() default -1; /** * 连接的最大空闲时间 */ int maxIdleTime() default -1; /** * 多少条操作一次 */ int batchSize() default -1; /** * flink引擎:flush的间隔周期(ms) */ long flushInterval() default -1; /** * flink引擎:失败最大重试次数 */ int maxRetries() default -1; /** * spark引擎:scan后的缓存级别:fire.jdbc.storage.level */ String storageLevel() default ""; /** * spark引擎:select后存放到rdd的多少个partition中:fire.jdbc.query.partitions */ int queryPartitions() default -1; /** * 日志中打印的sql长度 */ int logSqlLength() default -1; /** * c3p0参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/Jdbc5.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Jdbc connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-06-16 14:56:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Jdbc5 { /** * Jdbc的url,同value */ String url(); /** * jdbc 驱动类,不填可根据url自动推断 */ String driver() default ""; /** * jdbc的用户名 */ String username(); /** * jdbc的密码 */ String password() default ""; /** * 事务的隔离级别 */ String isolationLevel() default ""; /** * 连接池的最大连接数 */ int maxPoolSize() default -1; /** * 连接池最少连接数 */ int minPoolSize() default -1; /** * 连接池初始连接数 */ int initialPoolSize() default -1; /** * 连接池的增量 */ int acquireIncrement() default -1; /** * 连接的最大空闲时间 */ int maxIdleTime() default -1; /** * 多少条操作一次 */ int batchSize() default -1; /** * flink引擎:flush的间隔周期(ms) */ long flushInterval() default -1; /** * flink引擎:失败最大重试次数 */ int maxRetries() default -1; /** * spark引擎:scan后的缓存级别:fire.jdbc.storage.level */ String storageLevel() default ""; /** * spark引擎:select后存放到rdd的多少个partition中:fire.jdbc.query.partitions */ int queryPartitions() default -1; /** * 日志中打印的sql长度 */ int logSqlLength() default -1; /** * c3p0参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/Kafka.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Kafka connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-04-26 13:36:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Kafka { /** * kafka集群连接信息,同value */ String brokers(); /** * kafka topics,多个使用逗号分隔 */ String topics(); /** * 消费者标识 */ String groupId(); /** * 指定从何处开始消费 */ String startingOffset() default ""; /** * 指定消费到何处结束 */ String endingOffsets() default ""; /** * 是否开启主动提交offset */ boolean autoCommit() default false; /** * session超时时间(ms) */ long sessionTimeout() default -1; /** * request超时时间(ms) */ long requestTimeout() default -1; /** * poll的周期(ms) */ long pollInterval() default -1; /** * 从指定的时间戳开始消费 */ long startFromTimestamp() default -1; /** * 指定从kafka中保持的offset开始继续消费 */ boolean startFromGroupOffsets() default false; /** * 是否强制覆盖checkpoint中保持的offset信息,从指定位置开始消费 */ boolean forceOverwriteStateOffset() default false; /** * 是否在开启checkpoint的情况下强制周期性提交offset到kafka */ boolean forceAutoCommit() default false; /** * 强制提交的周期(ms) */ long forceAutoCommitInterval() default -1; /** * kafka-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/Kafka2.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Kafka connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-04-26 13:36:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Kafka2 { /** * kafka集群连接信息 */ String brokers(); /** * kafka topics,多个使用逗号分隔 */ String topics(); /** * 消费者标识 */ String groupId(); /** * 指定从何处开始消费 */ String startingOffset() default ""; /** * 指定消费到何处结束 */ String endingOffsets() default ""; /** * 是否开启主动提交offset */ boolean autoCommit() default false; /** * session超时时间(ms) */ long sessionTimeout() default -1; /** * request超时时间(ms) */ long requestTimeout() default -1; /** * poll的周期(ms) */ long pollInterval() default -1; /** * 从指定的时间戳开始消费 */ long startFromTimestamp() default -1; /** * 指定从kafka中保持的offset开始继续消费 */ boolean startFromGroupOffsets() default false; /** * 是否强制覆盖checkpoint中保持的offset信息,从指定位置开始消费 */ boolean forceOverwriteStateOffset() default false; /** * 是否在开启checkpoint的情况下强制周期性提交offset到kafka */ boolean forceAutoCommit() default false; /** * 强制提交的周期(ms) */ long forceAutoCommitInterval() default -1; /** * kafka-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/Kafka3.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Kafka connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-04-26 13:36:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Kafka3 { /** * kafka集群连接信息 */ String brokers(); /** * kafka topics,多个使用逗号分隔 */ String topics(); /** * 消费者标识 */ String groupId(); /** * 指定从何处开始消费 */ String startingOffset() default ""; /** * 指定消费到何处结束 */ String endingOffsets() default ""; /** * 是否开启主动提交offset */ boolean autoCommit() default false; /** * session超时时间(ms) */ long sessionTimeout() default -1; /** * request超时时间(ms) */ long requestTimeout() default -1; /** * poll的周期(ms) */ long pollInterval() default -1; /** * 从指定的时间戳开始消费 */ long startFromTimestamp() default -1; /** * 指定从kafka中保持的offset开始继续消费 */ boolean startFromGroupOffsets() default false; /** * 是否强制覆盖checkpoint中保持的offset信息,从指定位置开始消费 */ boolean forceOverwriteStateOffset() default false; /** * 是否在开启checkpoint的情况下强制周期性提交offset到kafka */ boolean forceAutoCommit() default false; /** * 强制提交的周期(ms) */ long forceAutoCommitInterval() default -1; /** * kafka-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/Kafka4.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Kafka connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-06-16 14:36:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Kafka4 { /** * kafka集群连接信息 */ String brokers(); /** * kafka topics,多个使用逗号分隔 */ String topics(); /** * 消费者标识 */ String groupId(); /** * 指定从何处开始消费 */ String startingOffset() default ""; /** * 指定消费到何处结束 */ String endingOffsets() default ""; /** * 是否开启主动提交offset */ boolean autoCommit() default false; /** * session超时时间(ms) */ long sessionTimeout() default -1; /** * request超时时间(ms) */ long requestTimeout() default -1; /** * poll的周期(ms) */ long pollInterval() default -1; /** * 从指定的时间戳开始消费 */ long startFromTimestamp() default -1; /** * 指定从kafka中保持的offset开始继续消费 */ boolean startFromGroupOffsets() default false; /** * 是否强制覆盖checkpoint中保持的offset信息,从指定位置开始消费 */ boolean forceOverwriteStateOffset() default false; /** * 是否在开启checkpoint的情况下强制周期性提交offset到kafka */ boolean forceAutoCommit() default false; /** * 强制提交的周期(ms) */ long forceAutoCommitInterval() default -1; /** * kafka-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/Kafka5.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行Kafka connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-06-16 14:36:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Kafka5 { /** * kafka集群连接信息 */ String brokers(); /** * kafka topics,多个使用逗号分隔 */ String topics(); /** * 消费者标识 */ String groupId(); /** * 指定从何处开始消费 */ String startingOffset() default ""; /** * 指定消费到何处结束 */ String endingOffsets() default ""; /** * 是否开启主动提交offset */ boolean autoCommit() default false; /** * session超时时间(ms) */ long sessionTimeout() default -1; /** * request超时时间(ms) */ long requestTimeout() default -1; /** * poll的周期(ms) */ long pollInterval() default -1; /** * 从指定的时间戳开始消费 */ long startFromTimestamp() default -1; /** * 指定从kafka中保持的offset开始继续消费 */ boolean startFromGroupOffsets() default false; /** * 是否强制覆盖checkpoint中保持的offset信息,从指定位置开始消费 */ boolean forceOverwriteStateOffset() default false; /** * 是否在开启checkpoint的情况下强制周期性提交offset到kafka */ boolean forceAutoCommit() default false; /** * 强制提交的周期(ms) */ long forceAutoCommitInterval() default -1; /** * kafka-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/RocketMQ.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行RocketMQ connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-04-26 15:18:34 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface RocketMQ { /** * rocketmq集群连接信息 */ String brokers(); /** * rocketmq topics,多个使用逗号分隔 */ String topics(); /** * 消费者标识 */ String groupId(); /** * 指定消费的tag */ String tag() default "*"; /** * 指定从何处开始消费 */ String startingOffset() default ""; /** * 是否开启主动提交offset */ boolean autoCommit() default false; /** * RocketMQ-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/RocketMQ2.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
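As with the Kafka annotations, @RocketMQ (and the numbered RocketMQ2 through RocketMQ5 variants that follow) is applied at class level. A minimal sketch with a hypothetical job class; note that tag defaults to "*" (all tags) for @RocketMQ.

// Hypothetical job class for illustration only; members are defined in RocketMQ.java above.
@RocketMQ(
        brokers = "rocketmq-server:9876",      // nameserver address (cluster aliases appear to be supported via fire.rocket.cluster.map.* in cluster.properties)
        topics = "fire_rocketmq_topic",        // multiple topics separated by commas
        groupId = "fire_demo_group",           // consumer group id
        tag = "*",                             // consume all tags (the declared default)
        config = {"some.client.key=value"}     // placeholder: RocketMQ-client options as key=value pairs
)
public class RocketMQDemoJob {
}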
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行RocketMQ connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-04-26 15:18:34 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface RocketMQ2 { /** * rocketmq集群连接信息 */ String brokers(); /** * rocketmq topics,多个使用逗号分隔 */ String topics(); /** * 消费者标识 */ String groupId(); /** * 指定消费的tag */ String tag() default ""; /** * 指定从何处开始消费 */ String startingOffset() default ""; /** * 是否开启主动提交offset */ boolean autoCommit() default false; /** * RocketMQ-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/RocketMQ3.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行RocketMQ connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-04-26 15:18:34 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface RocketMQ3 { /** * rocketmq集群连接信息 */ String brokers(); /** * rocketmq topics,多个使用逗号分隔 */ String topics(); /** * 消费者标识 */ String groupId(); /** * 指定消费的tag */ String tag() default ""; /** * 指定从何处开始消费 */ String startingOffset() default ""; /** * 是否开启主动提交offset */ boolean autoCommit() default false; /** * RocketMQ-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/RocketMQ4.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行RocketMQ connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-06-16 14:18:34 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface RocketMQ4 { /** * rocketmq集群连接信息 */ String brokers(); /** * rocketmq topics,多个使用逗号分隔 */ String topics(); /** * 消费者标识 */ String groupId(); /** * 指定消费的tag */ String tag() default ""; /** * 指定从何处开始消费 */ String startingOffset() default ""; /** * 是否开启主动提交offset */ boolean autoCommit() default false; /** * RocketMQ-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/connector/RocketMQ5.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.connector; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行RocketMQ connector配置,优先级低于配置文件,高于@Config注解 * * @author ChengLong 2022-06-16 14:18:34 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface RocketMQ5 { /** * rocketmq集群连接信息 */ String brokers(); /** * rocketmq topics,多个使用逗号分隔 */ String topics(); /** * 消费者标识 */ String groupId(); /** * 指定消费的tag */ String tag() default ""; /** * 指定从何处开始消费 */ String startingOffset() default ""; /** * 是否开启主动提交offset */ boolean autoCommit() default false; /** * RocketMQ-client参数,以key=value形式注明 */ String[] config() default ""; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/After.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注生命周期方法,在用户代码执行完成后调用,可用于资源释放 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface After { } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Before.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注生命周期方法,在引擎初始化前被调用执行,可用于资源初始化 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Before { } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Handle.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码入口方法,用法同@Process * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Handle { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Process.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码入口方法,用法同@Handle * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Process { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step1.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
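Taken together, the lifecycle annotations above mark plain methods on a job class: @Before runs before engine initialization, @Handle (or the equivalent @Process above) marks the business-logic entry point, and @After runs after user code for cleanup. A minimal sketch with a hypothetical class:

// Hypothetical job class for illustration only; the annotations come from
// com.zto.fire.core.anno.lifecycle above.
public class LifecycleDemoJob {

    @Before                                // called before the engine is initialized (resource setup)
    public void init() {
    }

    @Handle("main business logic")         // entry point; usage is the same as @Process
    public void handle() {
    }

    @After                                 // called after user code finishes (resource release)
    public void close() {
    }
}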
*/ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step1 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step10.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step10 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step11.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
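Step1 through Step19 (below) are identical markers that split the business logic into ordered, described steps: value documents the step, and skipError lets execution continue to the next step when an exception occurs. A short hypothetical sketch:

// Hypothetical methods for illustration only.
public class StepDemoJob {

    @Step1("load dimension tables")
    public void loadDimensions() {
    }

    @Step2(value = "join and aggregate", skipError = true)   // on exception, skip to the next step
    public void aggregate() {
    }
}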
*/ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step11 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step12.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step12 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step13.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step13 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step14.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step14 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step15.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step15 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step16.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step16 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step17.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step17 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step18.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step18 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step19.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step19 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step2.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step2 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step3.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step3 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step4.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step4 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step5.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step5 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step6.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step6 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step7.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step7 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step8.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step8 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/anno/lifecycle/Step9.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.anno.lifecycle; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 标记注解:用于标注业务逻辑代码执行步骤 * * @author ChengLong 2022-08-09 09:49:12 * @since 2.3.2 */ @Target(ElementType.METHOD) @Retention(RetentionPolicy.RUNTIME) public @interface Step9 { /** * 业务代码逻辑描述 */ String value() default ""; /** * 当发生异常时,是否跳过异常执行下一步 */ boolean skipError() default false; } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/bean/ArthasParam.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.bean; /** * 用于承载或解析Arthas相关restful参数 * * @author ChengLong 2021-11-11 10:58:45 * @since 2.2.0 */ public class ArthasParam { private String command; private Boolean distribute; private String ip; public ArthasParam() { } public ArthasParam(String command, Boolean distribute, String ip) { this.command = command; this.distribute = distribute; this.ip = ip; } public String getCommand() { return command; } public void setCommand(String command) { this.command = command; } public Boolean getDistribute() { return distribute; } public void setDistribute(Boolean distribute) { this.distribute = distribute; } public String getIp() { return ip; } public void setIp(String ip) { this.ip = ip; } } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/task/SchedulerManager.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
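ArthasParam above is a plain bean carrying the Arthas-related RESTful parameters (command, distribute, ip). A trivial construction sketch; the REST endpoint that consumes it is not shown in this file.

// Illustration only: shows the bean's shape; the actual REST endpoint and any
// JSON (de)serialization are handled elsewhere in the framework.
public class ArthasParamDemo {
    public static void main(String[] args) {
        ArthasParam param = new ArthasParam("dashboard", false, "10.0.0.1");
        System.out.println(param.getCommand());   // prints: dashboard
    }
}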
*/ package com.zto.fire.core.task; import com.google.common.collect.Maps; import com.zto.fire.common.anno.Scheduled; import com.zto.fire.common.conf.FireFrameworkConf; import com.zto.fire.common.util.DateFormatUtils; import com.zto.fire.common.util.ReflectionUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.time.DateUtils; import org.quartz.*; import org.quartz.impl.StdSchedulerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.Serializable; import java.lang.reflect.Method; import java.util.Date; import java.util.Map; import java.util.Properties; import java.util.concurrent.atomic.AtomicBoolean; /** * 定时任务管理器,内部使用Quartz框架 * 为了适用于Spark,没有采用按包扫描的方式去扫描标记有@Scheduled的方法 * 而是要主动通过TaskManager.registerTasks注册,然后扫描该实例中所有标记 * 有@Scheduled的方法,并根据cron表达式定时执行 * * @author ChengLong 2019年11月4日 18:06:21 * @since 0.3.5 */ public abstract class SchedulerManager implements Serializable { // 用于指定当前spark任务的main方法所在的对象实例 private static Map taskMap; // 已注册的task列表 private static Map alreadyRegisteredTaskMap; // 定时调度实例 private static Scheduler scheduler; // 初始化标识 private static AtomicBoolean isInit = new AtomicBoolean(false); // 定时任务黑名单,存放带有@Scheduler标识的方法名 private static Map blacklistMap = Maps.newHashMap(); protected static final String DRIVER = "driver"; protected static final String EXECUTOR = "executor"; private static final String DEFAULT_COLOR = "\u001B[0m ] "; private static final Logger logger = LoggerFactory.getLogger(SchedulerManager.class); static { String blacklistMethod = FireFrameworkConf.schedulerBlackList(); if (StringUtils.isNotBlank(blacklistMethod)) { String[] methods = blacklistMethod.split(","); for (String method : methods) { if (StringUtils.isNotBlank(method)) { blacklistMap.put(method.trim(), method); } } } } protected SchedulerManager() {} /** * 初始化quartz */ protected static void init() { if (isInit.compareAndSet(false, true)) { taskMap = Maps.newConcurrentMap(); alreadyRegisteredTaskMap = Maps.newConcurrentMap(); try { StdSchedulerFactory factory = new StdSchedulerFactory(); Properties quartzProp = new Properties(); quartzProp.setProperty("org.quartz.threadPool.threadCount", FireFrameworkConf.quartzMaxThread()); factory.initialize(quartzProp); scheduler = factory.getScheduler(); } catch (Exception e) { logger.error("初始化quartz发生异常", e); } } } /** * 添加待执行的任务列表类实例 * * @param tasks 带有@Scheduled的类的实例 */ protected void addScanTask(Object... tasks) { if (tasks != null && tasks.length > 0) { for (Object task : tasks) { if (task != null) { taskMap.put(task.getClass().getName(), task); } } } } /** * 判断当前是否为driver * @return */ protected abstract String label(); /** * 将标记有@Scheduled的类实例注册给定时调度管理器 * 注:参数是类的实例而不是Class类型,是由于像Spark所在的object类型传入后,会被反射调用构造器创建另一个实例 * 为了保证当前Spark任务所在的Object实例只有一个,约定传入的参数必须是类的实例而不是Class类型 * * @param taskInstances 具有@Scheduled注解类的实例 */ public synchronized void registerTasks(Object... 
taskInstances) { try { if (!FireFrameworkConf.scheduleEnable()) { return; } SchedulerManager.init(); addScanTask(taskInstances); if (!taskMap.isEmpty()) { for (Map.Entry entry : taskMap.entrySet()) { // 已经注册过的任务不再重复注册 if (alreadyRegisteredTaskMap.containsKey(entry.getKey())) { continue; } Class clazz = entry.getValue().getClass(); if (clazz != null) { Method[] methods = clazz.getDeclaredMethods(); for (Method method : methods) { if (method != null) { ReflectionUtils.setAccessible(method); if (blacklistMap.containsKey(method.getName())) { continue; } Scheduled anno = method.getAnnotation(Scheduled.class); String label = label(); if (anno != null && StringUtils.isNotBlank(anno.scope()) && ("all".equalsIgnoreCase(anno.scope()) || anno.scope().equalsIgnoreCase(label))) { // 通过anno.concurrent判断是否使用并发任务实例 JobDetail job = (anno.concurrent() ? JobBuilder.newJob(TaskRunner.class) : JobBuilder.newJob(TaskRunnerQueue.class)).usingJobData(clazz.getName() + "#" + method.getName(), anno.cron()).build(); TriggerBuilder triggerBuilder = TriggerBuilder.newTrigger(); if (StringUtils.isNotBlank(anno.cron())) { // 优先执行cron表达式 triggerBuilder.withSchedule(CronScheduleBuilder.cronSchedule(anno.cron())); } else if (anno.fixedInterval() != -1) { // 固定频率的调度器 SimpleScheduleBuilder simpleScheduleBuilder = SimpleScheduleBuilder .simpleSchedule().withIntervalInMilliseconds(anno.fixedInterval()); // 设定重复执行的次数 long repeatCount = anno.repeatCount(); if (repeatCount == -1) { simpleScheduleBuilder.repeatForever(); } else { simpleScheduleBuilder.withRepeatCount((int) repeatCount - 1); } triggerBuilder.withSchedule(simpleScheduleBuilder); } // 用于指定任务首次执行的时间 if (StringUtils.isNotBlank(anno.startAt())) { // startAt优先级较高 triggerBuilder.startAt(DateFormatUtils.formatDateTime(anno.startAt())); } else { // 首次延迟多久(毫秒)开始执行 if (anno.initialDelay() == 0) { triggerBuilder.startNow(); } if (anno.initialDelay() != 0 && anno.initialDelay() != -1) { triggerBuilder.startAt(DateUtils.addMilliseconds(new Date(), (int) anno.initialDelay())); } } // 添加到调度任务中 if (scheduler == null) { scheduler = StdSchedulerFactory.getDefaultScheduler(); } scheduler.scheduleJob(job, triggerBuilder.build()); // 将已注册的task放到已注册标记列表中,防止重复注册同一个类的同一个定时方法 alreadyRegisteredTaskMap.put(entry.getKey(), entry.getValue()); String schedulerInfo = buildSchedulerInfo(anno); logger.info("\u001B[33m---> 已注册定时任务[ {}.{} ],{}. 
\u001B[33m<---\u001B[0m", entry.getKey(), method.getName(), schedulerInfo); } } } } } if (alreadyRegisteredTaskMap.size() > 0) { scheduler.start(); } } } catch (Exception e) { logger.error("定时任务注册失败:作为定时任务的类必须可序列化,并且标记有@Scheduled的方法必须是无参的!", e); } } /** * 用于描述定时任务的详细信息 * * @param anno Scheduled注解 * @return 描述信息 */ protected String buildSchedulerInfo(Scheduled anno) { if (anno == null) { return "Scheduled为空"; } StringBuilder schedulerInfo = new StringBuilder("\u001B[31m调度信息\u001B[0m"); if (StringUtils.isNotBlank(anno.scope())) { schedulerInfo.append("[ 范围=\u001B[32m").append(anno.scope()).append(DEFAULT_COLOR); } if (StringUtils.isNotBlank(anno.cron())) { schedulerInfo.append("[ 频率=\u001B[33m").append(anno.cron()).append(DEFAULT_COLOR); } else if (anno.fixedInterval() != -1) { schedulerInfo.append("[ 频率=\u001B[34m").append(anno.fixedInterval()).append(DEFAULT_COLOR); } if (anno.initialDelay() != -1) { schedulerInfo.append("[ 延迟=\u001B[35m").append(anno.initialDelay()).append(DEFAULT_COLOR); } if (StringUtils.isNotBlank(anno.startAt())) { schedulerInfo.append("[ 启动时间=\u001B[36m").append(anno.startAt()).append(DEFAULT_COLOR); } if (anno.repeatCount() != -1) { schedulerInfo.append("[ 重复=\u001B[32m").append(anno.repeatCount()).append("\u001B[0m次 ] "); } return schedulerInfo.toString(); } /** * 通过execute方法调用传入的指定类的指定方法 */ public static void execute(JobExecutionContext context) { try { JobDataMap dataMap = context.getJobDetail().getJobDataMap(); for (Map.Entry entry : dataMap.entrySet()) { String key = entry.getKey(); // 定时调用指定类的指定方法 if (StringUtils.isNotBlank(key) && key.contains("#")) { String[] classMethod = key.split("#"); Class clazz = Class.forName(classMethod[0]); Method method = clazz.getMethod(classMethod[1]); Object instance = taskMap.get(classMethod[0]); if (instance != null) { method.invoke(instance); } } } } catch (Exception e) { logger.error("执行定时任务发生异常", e); } } /** * 用于判断当前的定时调度器是否已启动 */ public synchronized boolean schedulerIsStarted() { if (scheduler == null) { return false; } try { return scheduler.isStarted(); } catch (Exception e) { logger.error("获取调度器是否启用失败", e); } return false; } /** * 关闭定时调度 * * @param waitForJobsToComplete 是否等待所有job全部执行完成再关闭 */ public static synchronized void shutdown(boolean waitForJobsToComplete) { try { if (scheduler != null && !scheduler.isShutdown()) { scheduler.shutdown(waitForJobsToComplete); scheduler = null; taskMap.clear(); alreadyRegisteredTaskMap.clear(); logger.info("\u001B[33m---> 完成定时任务的资源回收. <---\u001B[0m"); } } catch (Exception e) { logger.error("定时任务注册失败:作为定时任务的类必须可序列化,并且标记有@Scheduled的方法必须是无参的!", e); } } } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/task/TaskRunner.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.task; import org.quartz.Job; import org.quartz.JobExecutionContext; import org.quartz.JobExecutionException; import java.io.Serializable; /** * Scheduler TaskRunner * @author ChengLong 2019年11月5日 09:59:33 * @since 0.3.5 */ public class TaskRunner implements Job, Serializable { @Override public void execute(JobExecutionContext context) throws JobExecutionException { SchedulerManager.execute(context); } } ================================================ FILE: fire-core/src/main/java/com/zto/fire/core/task/TaskRunnerQueue.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.task; import org.quartz.DisallowConcurrentExecution; /** * 线程安全的方式执行定时任务,同一实例同一时刻只能有一个任务 * @author ChengLong 2019年11月5日 09:59:33 * @since 0.3.5 */ @DisallowConcurrentExecution public class TaskRunnerQueue extends TaskRunner { } ================================================ FILE: fire-core/src/main/resources/cluster.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# # ----------------------------------------------- < \u96C6\u7FA4 \u914D\u7F6E > ------------------------------------------------ # # flink\u672C\u5730\u72B6\u6001\u8DEF\u5F84\u9009\u4E3E\u6240\u4F9D\u8D56\u7684zk\u5730\u5740 state.external.zookeeper.url = localhost:2181 # ----------------------------------------------- < kafka \u914D\u7F6E > ----------------------------------------------- # # kafka\u96C6\u7FA4\u540D\u79F0\u4E0E\u96C6\u7FA4\u5730\u5740\u6620\u5C04\uFF1Akafka.brokers.name=bigdata | kafka.brokers.name=zms fire.kafka.cluster.map.bigdata_test = kafka-server:9092 fire.kafka.cluster.map.oggKafka = kafka01:9092,kafka02:9092,kafka03:9092 # --------------------------------------------- < RocketMQ \u914D\u7F6E > ---------------------------------------------- # fire.rocket.cluster.map.zms = zms01:9876;zms02:9876;zms03:9876 fire.rocket.cluster.map.bigdata_test = rocketmq-server:9876 # -------------------------------------------- < spark-hive \u914D\u7F6E > --------------------------------------------- # # \u5B9E\u65F6\u96C6\u7FA4hive metastore\u5730\u5740\uFF08\u522B\u540D\uFF1Astreaming\uFF09 fire.hive.cluster.map.streaming = thrift://hive-streaming-thrift:9083 # \u6D4B\u8BD5\u96C6\u7FA4hive metastore\u5730\u5740\uFF08\u522B\u540D\uFF1Atest\uFF09 fire.hive.cluster.map.test = thrift://hive-thrift-server:9083 # ----------------------------------------------- < HDFS \u914D\u7F6E > ------------------------------------------------ # # \u7528\u4E8E\u662F\u5426\u542F\u7528HDFS HA hdfs.ha.enable = true # \u79BB\u7EBFhive\u96C6\u7FA4\u7684HDFS HA\u914D\u7F6E\u9879\uFF0C\u89C4\u5219\u4E3A\u7EDF\u4E00\u7684ha\u524D\u7F00\uFF1Aspark.hdfs.ha.conf.+hive.cluster\u540D\u79F0+hdfs\u4E13\u95E8\u7684ha\u914D\u7F6E hdfs.ha.conf.test.fs.defaultFS = hdfs://ns1 hdfs.ha.conf.test.dfs.nameservices = ns1 hdfs.ha.conf.test.dfs.ha.namenodes.ns1 = namenode310,namenode318 hdfs.ha.conf.test.dfs.namenode.rpc-address.ns1.namenode310 = node01:8020 hdfs.ha.conf.test.dfs.namenode.rpc-address.ns1.namenode318 = node03:8020 hdfs.ha.conf.test.dfs.client.failover.proxy.provider.ns1 = org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider # ----------------------------------------------- < HBase \u914D\u7F6E > ----------------------------------------------- # # \u79BB\u7EBF\u96C6\u7FA4hbase\u7684zk\u5730\u5740\uFF08\u522B\u540D\uFF1Abatch\uFF09 # fire.hbase.cluster.map.batch = zk01:2181,zk02:2181,zk03:2181 # \u6D4B\u8BD5\u96C6\u7FA4hbase\u7684zk\u5730\u5740\uFF08\u522B\u540D\uFF1Atest\uFF09 # fire.hbase.cluster.map.test = zk01:2181,zk02:2181,zk03:2181 # --------------------------------------------- < \u914D\u7F6E\u4E2D\u5FC3\u914D\u7F6E > --------------------------------------------- # fire.config_center.enable = false # \u672C\u5730\u8FD0\u884C\u73AF\u5883\u4E0B\uFF08Windows\u3001Mac\uFF09\u662F\u5426\u8C03\u7528\u914D\u7F6E\u4E2D\u5FC3\u63A5\u53E3\u83B7\u53D6\u914D\u7F6E\u4FE1\u606F\uFF0C\u4EFB\u52A1\u7EA7\u522B\u914D\u7F6E\u4E0D\u751F\u6548 fire.config_center.local.enable = false # \u914D\u7F6E\u4E2D\u5FC3\u63A5\u53E3\u8C03\u7528\u79D8\u94A5 fire.config_center.register.conf.secret = xxxxx # \u914D\u7F6E\u4E2D\u5FC3\u6CE8\u518C\u4E0E\u914D\u7F6E\u63A5\u53E3\u751F\u4EA7\u5730\u5740 fire.config_center.register.conf.prod.address = http://node01:8080/xx/xxx # \u914D\u7F6E\u4E2D\u5FC3\u6CE8\u518C\u4E0E\u914D\u7F6E\u63A5\u53E3\u6D4B\u8BD5\u5730\u5740 fire.config_center.register.conf.test.address = http://node02:8080/xx/xxx # arthas tunnel \u670D\u52A1\u7684ws\u5730\u5740 # 
fire.analysis.arthas.tunnel_server.url = ws://arthas-serer:7777/ws # ---------------------------------------------- < \u5F02\u5E38\u8BCA\u65AD\u914D\u7F6E > ----------------------------------------------- # fire.analysis.log.exception.send.mq.url = bigdata_test fire.analysis.log.exception.send.mq.topic = platform_realtime_analysis # ---------------------------------------------- < \u8840\u7F18\u91C7\u96C6\u914D\u7F6E > ----------------------------------------------- # fire.lineage.send.mq.url = bigdata_test fire.lineage.send.mq.topic = platform_realtime_lineage ================================================ FILE: fire-core/src/main/resources/fire.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # ----------------------------------------------- < fire \u914D\u7F6E > ------------------------------------------------ # # \u5F53\u524Dfire\u6846\u67B6\u7684\u7248\u672C\u53F7 fire.version = ${project.version} # fire\u5185\u7F6E\u7EBF\u7A0B\u6C60\u5927\u5C0F fire.thread.pool.size = 3 # fire\u5185\u7F6E\u5B9A\u65F6\u4EFB\u52A1\u7EBF\u7A0B\u6C60\u5927\u5C0F fire.thread.pool.schedule.size = 3 # \u662F\u5426\u542F\u7528restful\u670D\u52A1 fire.rest.enable = true # \u7528\u4E8E\u8BBE\u7F6E\u662F\u5426\u505A\u63A5\u53E3\u6821\u9A8C fire.rest.filter.enable = true # \u662F\u5426\u6253\u5370\u914D\u7F6E\u4FE1\u606F fire.conf.show.enable = false # fire.conf.deploy.engine=className\uFF0C\u5728\u4E0D\u540C\u5F15\u64CE\u5B9E\u73B0\u6A21\u5757\u4E2D\uFF0C\u6307\u5B9A\u5177\u4F53\u53EF\u83B7\u53D6\u914D\u7F6E\u4FE1\u606F\u7684EngineConf\u5B50\u7C7B\u5B9E\u73B0\uFF0C\u7528\u4E8E\u540C\u6B65\u914D\u7F6E\u5230\u5404container\u8282\u70B9 # \u662F\u5426\u6253\u5370restful\u5730\u5740 fire.rest.url.show.enable = false # \u662F\u5426\u542F\u7528hostname\u4F5C\u4E3Arest\u670D\u52A1\u7684\u8BBF\u95EE\u5730\u5740 fire.rest.url.hostname = false # \u662F\u5426\u5173\u95EDfire\u5185\u7F6E\u7684\u6240\u6709\u7D2F\u52A0\u5668 fire.acc.enable = true # \u65E5\u5FD7\u7D2F\u52A0\u5668\u5F00\u5173 fire.acc.log.enable = true # \u591A\u503C\u7D2F\u52A0\u5668\u5F00\u5173 fire.acc.multi.counter.enable = true # \u591A\u65F6\u95F4\u7EF4\u5EA6\u7D2F\u52A0\u5668\u5F00\u5173 fire.acc.multi.timer.enable = true # fire\u6846\u67B6\u57CB\u70B9\u65E5\u5FD7\u5F00\u5173\uFF0C\u5173\u95ED\u4EE5\u540E\u5C06\u4E0D\u518D\u6253\u5370\u57CB\u70B9\u65E5\u5FD7 fire.log.enable = true # \u7528\u4E8E\u9650\u5B9Afire\u6846\u67B6\u4E2Dsql\u65E5\u5FD7\u7684\u5B57\u7B26\u4E32\u957F\u5EA6 fire.log.sql.length = 100 # \u662F\u5426\u542F\u7528\u4E3Aconnector\u6CE8\u518Cshutdown hook\uFF0C\u5F53jvm\u9000\u51FA\u524Dclose fire.connector.shutdown_hook.enable = false # fire\u6846\u67B6\u9488\u5BF9jdbc\u64CD\u4F5C\u540E\u6570\u636E\u96C6\u7684\u7F13\u5B58\u7B56\u7565 fire.jdbc.storage.level = 
memory_and_disk_ser # \u901A\u8FC7JdbcConnector\u67E5\u8BE2\u540E\u5C06\u6570\u636E\u96C6\u653E\u5230\u591A\u5C11\u4E2A\u5206\u533A\u4E2D\uFF0C\u9700\u6839\u636E\u5B9E\u9645\u7684\u7ED3\u679C\u96C6\u505A\u914D\u7F6E fire.jdbc.query.partitions = 10 # \u662F\u5426\u542F\u7528\u5B9A\u65F6\u8C03\u5EA6 fire.task.schedule.enable = true # \u662F\u5426\u542F\u7528\u52A8\u6001\u914D\u7F6E fire.dynamic.conf.enable = true # fire\u6846\u67B6rest\u63A5\u53E3\u670D\u52A1\u6700\u5927\u7EBF\u7A0B\u6570 fire.restful.max.thread = 8 # quartz\u6700\u5927\u7EBF\u7A0B\u6C60\u5927\u5C0F fire.quartz.max.thread = 5 # fire\u6536\u96C6\u65E5\u5FD7\u4FDD\u7559\u7684\u6700\u5C11\u8BB0\u5F55\u6570 fire.acc.log.min.size = 500 # fire\u6536\u96C6\u65E5\u5FD7\u4FDD\u7559\u7684\u6700\u591A\u8BB0\u5F55\u6570 fire.acc.log.max.size = 1000 # timer\u7D2F\u52A0\u5668\u4FDD\u7559\u6700\u5927\u7684\u8BB0\u5F55\u6570 fire.acc.timer.max.size = 1000 # timer\u7D2F\u52A0\u5668\u6E05\u7406\u51E0\u5C0F\u65F6\u4E4B\u524D\u7684\u8BB0\u5F55 fire.acc.timer.max.hour = 12 # env\u7D2F\u52A0\u5668\u5F00\u5173 fire.acc.env.enable = true # env\u7D2F\u52A0\u5668\u4FDD\u7559\u6700\u591A\u7684\u8BB0\u5F55\u6570 fire.acc.env.max.size = 500 # env\u7D2F\u52A0\u5668\u4FDD\u7559\u6700\u5C11\u7684\u8BB0\u5F55\u6570 fire.acc.env.min.size = 100 # \u5B9A\u65F6\u8C03\u5EA6\u4EFB\u52A1\u9ED1\u540D\u5355\uFF0C\u914D\u7F6E\u7684value\u4E3A\u5B9A\u65F6\u4EFB\u52A1\u65B9\u6CD5\u540D\uFF0C\u591A\u4E2A\u4EE5\u9017\u53F7\u5206\u9694 fire.scheduler.blacklist = # \u914D\u7F6E\u6253\u5370\u9ED1\u540D\u5355\uFF0C\u5305\u542B\u8BE5\u914D\u7F6E\u5C06\u4E0D\u88AB\u6253\u5370 fire.conf.print.blacklist = .map.,pass,secret,zrc,connection,hdfs.ha,print.blacklist,yarn,namenode,metastore,address,redaction # fire\u6846\u67B6restful\u7AEF\u53E3\u51B2\u7A81\u91CD\u8BD5\u6B21\u6570 fire.restful.port.retry_num = 3 # fire\u6846\u67B6restful\u7AEF\u53E3\u51B2\u7A81\u91CD\u8BD5\u65F6\u95F4\uFF08ms\uFF09 fire.restful.port.retry_duration = 1000 # \u65E5\u5FD7\u7684\u7EA7\u522B\uFF0C\u7EDF\u4E00\u524D\u7F00\u4E3A\uFF1Afire.log.level.conf. 
fire.log.level.conf.org.apache.spark = INFO fire.log.level.conf.org.spark_project = INFO fire.log.level.conf.org.apache.kafka = WARN fire.log.level.conf.org.apache.zookeeper = WARN fire.log.level.conf.com.zto.fire = INFO fire.log.level.conf.org.eclipse.jetty.server = ERROR # \u662F\u5426\u5C06\u914D\u7F6E\u540C\u6B65\u5230executor\u3001taskmanager\u7AEF fire.deploy_conf.enable = true # \u6BCF\u4E2Ajvm\u5B9E\u4F8B\u5185\u90E8queue\u7528\u4E8E\u5B58\u653E\u5F02\u5E38\u5BF9\u8C61\u6570\u6700\u5927\u5927\u5C0F\uFF0C\u907F\u514D\u961F\u5217\u8FC7\u5927\u9020\u6210\u5185\u5B58\u6EA2\u51FA fire.exception_bus.size = 1000 # \u662F\u5426\u5F00\u542F\u5B9E\u65F6\u8840\u7F18\u91C7\u96C6 fire.lineage.enable = false # \u662F\u5426\u5F00\u542F\u5C06\u8840\u7F18\u4FE1\u606F\u53D1\u9001\u5230\u6D88\u606F\u961F\u5217 fire.lineage.send.mq.enable = false # \u8840\u7F18\u89E3\u6790\u5F02\u6B65\u7EBF\u7A0B\u6267\u884C\u7684\u6B21\u6570 fire.lineage.run.count = 360 # \u8840\u7F18\u89E3\u6790\u5B58\u653E\u7684\u961F\u5217\u6570\u6700\u5927\u503C fire.lineage.max.size = 200 # \u5B9A\u65F6\u89E3\u6790\u57CB\u70B9SQL\u7684\u521D\u59CB\u5EF6\u8FDF\uFF08s\uFF09 fire.lineage.run.initialDelay = 10 # \u5B9A\u65F6\u89E3\u6790\u57CB\u70B9SQL\u7684\u6267\u884C\u9891\u7387\uFF08s\uFF09 fire.lineage.run.period = 60 # \u7528\u4E8Ejdbc url\u7684\u8BC6\u522B\uFF0C\u5F53\u65E0\u6CD5\u901A\u8FC7driver class\u8BC6\u522B\u6570\u636E\u6E90\u65F6\uFF0C\u5C06\u4ECEurl\u4E2D\u7684\u7AEF\u53E3\u53F7\u8FDB\u884C\u533A\u5206\uFF0C\u4E0D\u540C\u6570\u636E\u914D\u7F6E\u4F7F\u7528\u7EDF\u4E00\u7684\u524D\u7F00\uFF1Afire.lineage.datasource.map. fire.lineage.datasource.map.tidb = 4000 # \u662F\u5426\u5F00\u542F\u914D\u7F6E\u81EA\u9002\u5E94\u524D\u7F00\uFF0C\u81EA\u52A8\u4E3A\u914D\u7F6E\u52A0\u4E0A\u5F15\u64CE\u524D\u7F00\uFF08spark.|flink.\uFF09 fire.conf.adaptive.prefix = true # \u7528\u6237\u7EDF\u4E00\u914D\u7F6E\u6587\u4EF6\uFF0C\u5141\u8BB8\u7528\u6237\u5728\u8BE5\u914D\u7F6E\u6587\u4EF6\u4E2D\u5B58\u653E\u516C\u5171\u7684\u914D\u7F6E\u4FE1\u606F\uFF0C\u4F18\u5148\u7EA7\u4F4E\u4E8E\u4EFB\u52A1\u914D\u7F6E\u6587\u4EF6\uFF08\u591A\u4E2A\u4EE5\u9017\u53F7\u5206\u9694\uFF09 fire.user.common.conf = common.properties # fire\u63A5\u53E3\u8BA4\u8BC1\u79D8\u94A5 fire.rest.server.secret = fire # \u662F\u5426\u5728\u8C03\u7528shutdown\u65B9\u6CD5\u65F6\u4E3B\u52A8\u9000\u51FAjvm\u8FDB\u7A0B fire.shutdown.auto.exit = true # \u8C03\u7528print\u6253\u5370\u65E5\u5FD7\u81F3\u591A\u591A\u5C11\u6761\uFF0C\u7528\u4E8E\u9650\u5236\u6BCF\u4E2Acontainer\u6253\u5370\u7684\u6570\u636E\u91CF\uFF0C\u907F\u514D\u5927\u65E5\u5FD7\u6587\u4EF6\u5360\u6EE1\u78C1\u76D8\uFF0C\u5F53\u503C\u5C0F\u4E8E\u7B49\u4E8E\u96F6\u65F6\u8868\u793A\u4E0D\u9650\u5236 fire.print.limit = 1000000 # \u7528\u4E8E\u6307\u5B9A\u5F53\u524D\u8FD0\u884C\u73AF\u5883\u662F\u5426\u4E3Alocal\u6A21\u5F0F\uFF08\u4E3B\u8981\u7528\u4E8Eflink-shell\u7684\u672C\u5730\u914D\u7F6E\u6587\u4EF6\u52A0\u8F7D\uFF09 # fire.env.local = false # \u662F\u5426\u542F\u7528\u57FA\u4E8E\u6CE8\u89E3\u8FDB\u884C\u914D\u7F6E\uFF08@Kafka\u3001@RocketMQ\u3001@Hive\u3001@HBase\u7B49\uFF09 fire.conf.annotation.enable = true # \u662F\u5426\u542F\u7528\u5F02\u5E38\u5806\u6808\u65E5\u5FD7\u7684\u91C7\u96C6\uFF0C\u4E00\u65E6\u5F00\u542F\uFF0C\u5C06\u81EA\u52A8\u91C7\u96C6sql+api\u7B49\u5F02\u5E38\u5806\u6808\uFF0C\u5E76\u53D1\u9001\u5230\u6307\u5B9A\u7684kafka topic\u4E2D fire.analysis.log.exception.stack.enable = false # \u5F02\u5E38\u4FE1\u606F\u53D1\u9001MQ\u5931\u8D25\u65F6\u6700\u5927\u91CD\u8BD5\u6B21\u6570 
fire.analysis.log.exception.send.maxRetires = 10 # \u5F02\u5E38\u6D88\u606F\u53D1\u9001MQ\u8D85\u65F6\u65F6\u95F4 fire.analysis.log.exception.send.timeout = 3000 # \u662F\u5426\u81EA\u52A8\u63D0\u4EA4\u4EFB\u52A1 fire.job.autoStart = true # fire\u6846\u67B6\u540C\u6B65\u7D2F\u52A0\u5668\u5728\u6BCF\u4E2Acontainer\u7AEF\u5B58\u653E\u5B57\u7B26\u4E32\u7684\u6700\u5927\u8BB0\u5F55\u6570 fire.acc.sync.max.size = 100 # ----------------------------------------------- < kafka \u914D\u7F6E > ----------------------------------------------- # # kafka\u96C6\u7FA4\u540D\u79F0\u4E0E\u96C6\u7FA4\u5730\u5740\u6620\u5C04\uFF0C\u4EFB\u52A1\u4E2D\u901A\u8FC7kafka.brokers.name=local\u5373\u53EF\u8FDE\u5230\u4EE5\u4E0B\u914D\u7F6E\u7684broker\u5730\u5740 # fire.kafka.cluster.map.local = localhost:9092,localhost02:9092 # ----------------------------------------------- < hive \u914D\u7F6E > ------------------------------------------------ # # \u9ED8\u8BA4\u7684hive\u6570\u636E\u5E93 fire.hive.default.database.name = tmp # \u9ED8\u8BA4\u7684hive\u5206\u533A\u5B57\u6BB5\u540D\u79F0 fire.hive.table.default.partition.name = ds # \u79BB\u7EBF\u96C6\u7FA4hive metastore\u5730\u5740\uFF08\u522B\u540D\uFF1Alocal\uFF09\uFF0C\u4EFB\u52A1\u4E2D\u901A\u8FC7fire.hive.cluster=local\u5373\u53EF\u8FDE\u5230\u4E00\u4E0B\u914D\u7F6E\u7684thrift\u5730\u5740 # fire.hive.cluster.map.local = thrift://localhost:9083,thrift://localhost02:9083 # \u662F\u5426\u542F\u7528hive metastore url\u7684\u968F\u673A\u9009\u62E9 fire.hive.metastore.url.random.enable = true # ----------------------------------------------- < HBase \u914D\u7F6E > ----------------------------------------------- # # \u4E00\u6B21\u8BFB\u5199HBase\u7684\u6570\u636E\u91CF fire.hbase.batch.size = 10000 # fire\u6846\u67B6\u9488\u5BF9hbase\u64CD\u4F5C\u540E\u6570\u636E\u96C6\u7684\u7F13\u5B58\u7B56\u7565 fire.hbase.storage.level = memory_and_disk_ser # \u901A\u8FC7HBase scan\u540Erepartition\u7684\u5206\u533A\u6570\uFF0C\u9700\u6839\u636Escan\u540E\u7684\u6570\u636E\u91CF\u505A\u914D\u7F6E fire.hbase.scan.partitions = -1 # \u540E\u7EED\u7248\u672C\u4F1A\u5E9F\u5F03\uFF0C\u5E9F\u5F03\u540Efire.hbase.scan.partitions\u9ED8\u8BA4\u503C\u6539\u4E3A1200 fire.hbase.scan.repartitions = 1200 # \u662F\u5426\u5F00\u542FHBase\u8868\u5B58\u5728\u5224\u65AD\u7684\u7F13\u5B58\uFF0C\u5F00\u542F\u540E\u8868\u5B58\u5728\u5224\u65AD\u5C06\u907F\u514D\u5927\u91CF\u7684connection\u6D88\u8017 fire.hbase.table.exists.cache.enable = true # \u662F\u5426\u5F00\u542FHBase\u8868\u5B58\u5728\u5217\u8868\u7F13\u5B58\u7684\u5B9A\u65F6\u66F4\u65B0\u4EFB\u52A1 fire.hbase.table.exists.cache.reload.enable = true # \u5B9A\u65F6\u5237\u65B0\u7F13\u5B58HBase\u8868\u4EFB\u52A1\u7684\u521D\u59CB\u5EF6\u8FDF\uFF08s\uFF09 fire.hbase.table.exists.cache.initialDelay = 60 # \u5B9A\u65F6\u5237\u65B0\u7F13\u5B58HBase\u8868\u4EFB\u52A1\u7684\u6267\u884C\u9891\u7387\uFF08s\uFF09 fire.hbase.table.exists.cache.period = 600 # hbase\u96C6\u7FA4\u7684zk\u5730\u5740\uFF08\u522B\u540D\uFF1Alocal\uFF09\uFF0C\u4EFB\u52A1\u4E2D\u901A\u8FC7hbase.cluster=local\u5373\u53EF\u8FDE\u5230\u5BF9\u5E94\u7684hbase\u96C6\u7FA4 # fire.hbase.cluster.map.local = localhost:2181,localhost02:2181 # hbase connection \u914D\u7F6E\uFF0C\u7EA6\u5B9A\u4EE5\uFF1Afire.hbase.conf.\u5F00\u5934\uFF0C\u6BD4\u5982\uFF1Afire.hbase.conf.hbase.rpc.timeout\u5BF9\u5E94hbase\u4E2D\u7684\u914D\u7F6E\u4E3Ahbase.rpc.timeout fire.hbase.conf.hbase.zookeeper.property.clientPort = 2181 fire.hbase.conf.zookeeper.znode.parent = /hbase fire.hbase.conf.hbase.rpc.timeout = 
600000 fire.hbase.conf.hbase.snapshot.master.timeoutMillis = 600000 fire.hbase.conf.hbase.snapshot.region.timeout = 600000 # ---------------------------------------------- < Arths \u914D\u7F6E > ----------------------------------------------- # # \u662F\u5426\u542F\u7528arthas\u4EE5\u4FBF\u4E8E\u5206\u6790\u4EFB\u52A1\u7684\u6027\u80FD fire.analysis.arthas.enable = false # \u662F\u5426\u5728container\u7AEF\u542F\u52A8arthas fire.analysis.arthas.container.enable = false # \u4EE5fire.analysis.arthas.conf.\u4E3A\u524D\u7F00\u7684\u914D\u7F6E\u652F\u6301arthas\u5168\u90E8\u7684\u53C2\u6570 # --------------------------------------------- < \u914D\u7F6E\u4E2D\u5FC3\u914D\u7F6E > --------------------------------------------- # # \u6CE8\uFF1A\u914D\u7F6E\u4E2D\u5FC3\u7CFB\u7EDF\u5F02\u5E38\u65F6\u53EF\u8BBE\u7F6E\u4E3Afalse\uFF0C\u4E0D\u53D7\u914D\u7F6E\u4E2D\u5FC3\u5F71\u54CD\uFF0C\u53EF\u6B63\u5E38\u53D1\u5E03\u548C\u8FD0\u884C\uFF0C\u4EFB\u52A1\u7EA7\u522B\u914D\u7F6E\u4E0D\u751F\u6548 fire.config_center.enable = true # \u672C\u5730\u8FD0\u884C\u73AF\u5883\u4E0B\uFF08Windows\u3001Mac\uFF09\u662F\u5426\u8C03\u7528\u914D\u7F6E\u4E2D\u5FC3\u63A5\u53E3\u83B7\u53D6\u914D\u7F6E\u4FE1\u606F\uFF0C\u4EFB\u52A1\u7EA7\u522B\u914D\u7F6E\u4E0D\u751F\u6548 fire.config_center.local.enable = false ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/Api.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core /** * Fire变量API * * @author ChengLong * @since 1.0.0 * @create 2021-01-12 17:16 */ private[fire] trait Api { /** * 流的启动 */ def start: Any } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/BaseFire.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core import com.zto.fire.common.conf.{FireFrameworkConf, FirePS1Conf} import com.zto.fire.common.enu.JobType import com.zto.fire.common.util.{FireUtils, _} import com.zto.fire.core.anno.lifecycle.{After, Before} import com.zto.fire.core.conf.AnnoManager import com.zto.fire.core.plugin.ArthasManager import com.zto.fire.core.rest.{RestServerManager, SystemRestful} import com.zto.fire.core.task.SchedulerManager import com.zto.fire.predef._ import org.apache.log4j.{Level, Logger} import spark.Spark import java.util.concurrent.atomic.AtomicBoolean import scala.util.Try /** * 通用的父接口,提供通用的生命周期方法约束 * * @author ChengLong 2020年1月7日 09:20:02 * @since 0.4.1 */ trait BaseFire extends Logging { // 任务启动时间戳 protected[fire] val launchTime = FireUtils.launchTime // web ui地址 protected[fire] var webUI, applicationId: String = _ // main方法参数 protected[fire] var args: Array[String] = _ // 当前任务的类型标识 protected[fire] val jobType = JobType.UNDEFINED // fire框架内置的restful接口 private[fire] var systemRestful: SystemRestful = _ // restful接口注册 private[fire] var restfulRegister: RestServerManager = _ // 用于子类的锁状态判断,默认关闭状态 protected[fire] lazy val lock = new AtomicBoolean(false) // 是否已停止 protected[fire] lazy val isStoped = new AtomicBoolean(false) // 当前任务的类名(包名+类名) protected[fire] lazy val className: JString = this.getClass.getName.replace("$", "") // 当前任务的类名 protected[fire] lazy val driverClass: JString = this.getClass.getSimpleName.replace("$", "") // 默认的任务名称为类名 protected[fire] var appName: JString = this.driverClass // 配置信息 protected lazy val conf, $ = PropUtils this.boot() /** * 生命周期方法:初始化fire框架必要的信息 * 注:该方法会同时在driver端与executor端执行 */ private[fire] def boot(): Unit = { FireUtils.splash if (FireFrameworkConf.arthasEnable) ArthasManager.startArthas(this.resourceId, FireFrameworkConf.arthasContainerEnable) PropUtils.sliceKeys(FireFrameworkConf.FIRE_LOG_LEVEL_CONF_PREFIX).foreach(kv => Logger.getLogger(kv._1).setLevel(Level.toLevel(kv._2))) ExceptionBus.sendToMQ } /** * SQL语法校验,如果语法错误,则返回错误堆栈 * @param sql * sql statement */ def sqlValidate(sql: String): Try[Unit] /** * SQL语法校验 * @param sql * sql statement * @return * true:校验成功 false:校验失败 */ def sqlLegal(sql: String): Boolean /** * 获取任务的resourceId * @return * spark任务:driver/id flink任务:JobManager/container_xxx */ protected def resourceId: String /** * 在加载任务配置文件前将被加载 */ private[fire] def loadConf(): Unit = { // 加载配置文件 } /** * 用于将不同引擎的配置信息、累计器信息等传递到executor端或taskmanager端 */ protected def deployConf(): Unit = { // 用于在分布式环境下分发配置信息 } /** * 生命周期方法:用于在SparkSession初始化之前完成用户需要的动作 * 注:该方法会在进行init之前自动被系统调用 * * @param args * main方法参数 */ def before(args: Array[String]): Unit = { // 生命周期方法,在init之前被调用 AnnoManager.lifeCycleAnno(this, classOf[Before]) } /** * 生命周期方法:初始化运行信息 * * @param conf 配置信息 * @param args main方法参数 */ def init(conf: Any = null, args: Array[String] = null): Unit = { this.before(args) FireUtils._jobType = this.jobType this.logger.info(s" ${FirePS1Conf.YELLOW}---> 完成用户资源初始化,任务类型:${this.jobType.getJobTypeDesc} <--- ${FirePS1Conf.DEFAULT}") this.args = args this.createContext(conf) } /** * 创建计算引擎运行时环境 * * @param conf * 配置信息 */ private[fire] def createContext(conf: Any): Unit /** * 生命周期方法:具体的用户开发的业务逻辑代码 * 注:此方法会被自动调用,不需要在main中手动调用 */ def process(): Unit /** * 生命周期方法:依次调用process方法以及加了注解的业务逻辑处理方法 * 注:此方法会被自动调用,不需要在main中手动调用 */ protected[fire] def processAll: Unit = { tryWithLog({ this.process() AnnoManager.processAnno(this) }) (this.logger, "业务逻辑代码执行完成", "业务逻辑代码执行失败", isThrow = true) } /** * 生命周期方法:用于资源回收与清理,子类复写实现具体逻辑 * 注:该方法会在进行destroy之前自动被系统调用 */ def after(): 
Unit = { AnnoManager.lifeCycleAnno(this, classOf[After]) } /** * 生命周期方法:用于回收资源 */ def stop(): Unit /** * 生命周期方法:进行fire框架的资源回收 */ protected[fire] def shutdown(stopGracefully: Boolean = true, inListener: Boolean = false): Unit = { if (this.isStoped.compareAndSet(false, true)) { ThreadUtils.shutdown Spark.stop() SchedulerManager.shutdown(stopGracefully) this.logger.info(s" ${FirePS1Conf.YELLOW}---> 完成fire资源回收 <---${FirePS1Conf.DEFAULT}") this.logger.info(s"总耗时:${FirePS1Conf.RED}${elapsed(launchTime)}${FirePS1Conf.DEFAULT} The end...${FirePS1Conf.DEFAULT}") if (FireFrameworkConf.shutdownExit) System.exit(0) } } /** * 声明周期方法,禁止子类覆写 */ final protected def init: Unit = {} /** * 声明周期方法,禁止子类覆写 */ final protected def destory: Unit = {} /** * 初始化引擎上下文,如SparkSession、StreamExecutionEnvironment等 * 可根据实际情况,将配置参数放到同名的配置文件中进行差异化的初始化 */ def main(args: Array[String]): Unit = { this.init(null, args) } } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/conf/AnnoManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.conf import com.google.common.collect.Sets import com.zto.fire.common.anno.Internal import com.zto.fire.common.conf.FireFrameworkConf.FIRE_LOG_SQL_LENGTH import com.zto.fire.common.conf.FireKafkaConf._ import com.zto.fire.common.conf.FireRocketMQConf._ import com.zto.fire.common.conf.{FireHiveConf, KeyNum} import com.zto.fire.common.util.{Logging, ReflectionUtils, StringsUtils} import com.zto.fire.core.BaseFire import com.zto.fire.core.anno.lifecycle.{Handle, Process, Step1, Step10, Step11, Step12, Step13, Step14, Step15, Step16, Step17, Step18, Step19, Step2, Step3, Step4, Step5, Step6, Step7, Step8, Step9} import com.zto.fire.core.anno._ import com.zto.fire.core.anno.connector.{HBase, HBase2, HBase3, HBase4, HBase5, Hive, Jdbc, Jdbc2, Jdbc3, Jdbc4, Jdbc5, Kafka, Kafka2, Kafka3, Kafka4, Kafka5, RocketMQ, RocketMQ2, RocketMQ3, RocketMQ4, RocketMQ5} import com.zto.fire.predef._ import org.apache.commons.lang3.StringUtils import java.lang.annotation.Annotation import scala.collection.mutable.HashMap /** * 注解管理器:用于将主键中的配置信息映射为键值对信息 * 注:解析指定的配置注解需要满足以下两个条件: * 1. 在registerAnnoSet中注册新的注解 * 2. 
开发对应的map方法,如:mapHive解析@Hive、mapKafka解析@kafka注解 * * @author ChengLong 2022-04-26 11:19:00 * @since 2.2.2 */ @Internal private[fire] trait AnnoManager extends Logging { protected[fire] lazy val props = new HashMap[String, String]() this.register /** * 用于注册需要映射配置信息的自定义主键 */ @Internal protected[fire] def register: Unit /** * 将键值对配置信息存放到map中 * * @param key * 配置的key * @param value * 配置的value * @param keyNum * 配置key的数字结尾标识 */ @Internal protected def put(key: String, value: Any, keyNum: Int = KeyNum._1): this.type = { if (noEmpty(key, value)) { // 将配置中多余的空格去掉 val fixKey = StringUtils.trim(key) val fixValue = StringUtils.trim(value.toString) // 如果keyNum>1则将数值添加到key的结尾 val realKey = if (keyNum > 1) fixKey + keyNum else fixKey val isNumeric = StringsUtils.isNumeric(fixValue) // 约定注解中指定的配置的值如果为-1,表示不使用该项配置,通常-1表示默认值 if (!isNumeric || (isNumeric && fixValue.toLong != -1)) { this.props.put(realKey, fixValue) } } this } /** * 解析并将配置放入指定配置前缀的conf中 * * @param configPrefix * fire中定义的key的统一前缀 * @param config * 多个配置,同一行中的key value以等号分隔 */ @Internal private def putConfig(configPrefix: String, config: Array[String], keyNum: Int = KeyNum._1): Unit = { if (noEmpty(configPrefix, config)) { config.foreach(conf => { val kv = conf.split("=") if (kv != null && kv.length == 2) { this.put(s"${configPrefix}${kv(0).trim}", kv(1).trim, keyNum) } }) } } /** * 获取主键转为key value形式的Properties对象 */ @Internal def getAnnoProps(baseFire: Class[_]): HashMap[String, String] = { if (baseFire == null) return this.props // 获取入口类上所有的注解 val annotations = baseFire.getAnnotations val mapMethods = ReflectionUtils.getAllMethods(this.getClass) // 仅获取注册表中的注解配置信息 annotations.filter(anno => AnnoManager.registerAnnoSet.contains(anno.annotationType())).foreach(anno => { // 反射调用map+注解名称对应的方法: // 比如注解名称为Hive,则调用mapHive方法解析@Hive注解中的配置信息 val methodName = s"map${anno.annotationType().getSimpleName}" if (mapMethods.containsKey(methodName)) { mapMethods.get(methodName).invoke(this, anno) } }) this.props } /** * 用于映射Hbase相关配置信息 * * @param value * 对应注解中的value * @param config * 对应注解中的config */ @Internal private def mapHBaseConf(value: String, cluster: String, family: String, batchSize: Int, scanPartitions: Int, storageLevel: String, maxRetries: Int, durability: String, tableMetaCache: Boolean, config: Array[String], keyNum: Int = KeyNum._1): Unit = { this.put("hbase.cluster", value, keyNum) this.put("hbase.cluster", cluster, keyNum) this.put("hbase.column.family", family, keyNum) this.put("fire.hbase.batch.size", batchSize, keyNum) this.put("fire.hbase.scan.partitions", scanPartitions, keyNum) this.put("fire.hbase.storage.level", storageLevel, keyNum) this.put("hbase.max.retry", maxRetries, keyNum) this.put("hbase.durability", durability, keyNum) this.put("fire.hbase.table.exists.cache.enable", tableMetaCache, keyNum) this.putConfig("fire.hbase.conf.", config, keyNum) } /** * 将@HBase中配置的信息映射为键值对形式 * * @param HBase * HBase注解实例 */ @Internal def mapHBase(hbase: HBase): Unit = this.mapHBaseConf(hbase.value(), hbase.cluster(), hbase.family(), hbase.batchSize(), hbase.scanPartitions(), hbase.storageLevel(), hbase.maxRetries(), hbase.durability(), hbase.tableMetaCache(), hbase.config(), KeyNum._1) /** * 将@HBase中配置的信息映射为键值对形式 * * @param HBase2 * HBase注解实例 */ @Internal def mapHBase2(hbase: HBase2): Unit = this.mapHBaseConf(hbase.value(), hbase.cluster(), hbase.family(), hbase.batchSize(), hbase.scanPartitions(), hbase.storageLevel(), hbase.maxRetries(), hbase.durability(), hbase.tableMetaCache(), hbase.config(), KeyNum._2) /** * 将@HBase中配置的信息映射为键值对形式 * * @param HBase3 
* HBase注解实例 */ @Internal def mapHBase3(hbase: HBase3): Unit = this.mapHBaseConf(hbase.value(), hbase.cluster(), hbase.family(), hbase.batchSize(), hbase.scanPartitions(), hbase.storageLevel(), hbase.maxRetries(), hbase.durability(), hbase.tableMetaCache(), hbase.config(), KeyNum._3) /** * 将@HBase中配置的信息映射为键值对形式 * * @param HBase4 * HBase注解实例 */ @Internal def mapHBase4(hbase: HBase4): Unit = this.mapHBaseConf(hbase.value(), hbase.cluster(), hbase.family(), hbase.batchSize(), hbase.scanPartitions(), hbase.storageLevel(), hbase.maxRetries(), hbase.durability(), hbase.tableMetaCache(), hbase.config(), KeyNum._4) /** * 将@HBase中配置的信息映射为键值对形式 * * @param HBase5 * HBase注解实例 */ @Internal def mapHBase5(hbase: HBase5): Unit = this.mapHBaseConf(hbase.value(), hbase.cluster(), hbase.family(), hbase.batchSize(), hbase.scanPartitions(), hbase.storageLevel(), hbase.maxRetries(), hbase.durability(), hbase.tableMetaCache(), hbase.config(), KeyNum._5) /** * 用于映射JDBC相关配置信息 * 对应注解中的@Jdbc */ @Internal def mapJdbcConf(url: String, driver: String, username: String, password: String, isolationLevel: String, maxPoolSize: Int, minPoolSize: Int, initialPoolSize: Int, acquireIncrement: Int, maxIdleTime: Int, batchSize: Int, flushInterval: Long, maxRetries: Int, storageLevel: String, queryPartitions: Int, logSqlLength: Int, config: Array[String], keyNum: Int = KeyNum._1): Unit = { this.put("db.jdbc.url", url, keyNum) // TODO: driver自动推断 // val autoDriver = if (noEmpty(driver)) driver else DBUtils this.put("db.jdbc.driver", driver, keyNum) this.put("db.jdbc.user", username, keyNum) this.put("db.jdbc.password", password, keyNum) this.put("db.jdbc.isolation.level", isolationLevel, keyNum) this.put("db.jdbc.maxPoolSize", maxPoolSize, keyNum) this.put("db.jdbc.minPoolSize", minPoolSize, keyNum) this.put("db.jdbc.initialPoolSize", initialPoolSize, keyNum) this.put("db.jdbc.acquireIncrement", acquireIncrement, keyNum) this.put("db.jdbc.maxIdleTime", maxIdleTime, keyNum) this.put("db.jdbc.batch.size", batchSize, keyNum) this.put("db.jdbc.flushInterval", flushInterval, keyNum) this.put("db.jdbc.max.retry", maxRetries, keyNum) this.put("fire.jdbc.storage.level", storageLevel, keyNum) this.put("fire.jdbc.query.partitions", queryPartitions, keyNum) this.put(FIRE_LOG_SQL_LENGTH, logSqlLength, keyNum) this.putConfig("db.c3p0.conf.", config, keyNum) } /** * 将@Jdbc中配置的信息映射为键值对形式 * * @param Jdbc * Jdbc注解实例 */ @Internal def mapJdbc(jdbc: Jdbc): Unit = { this.mapJdbcConf(jdbc.url(), jdbc.driver(), jdbc.username(), jdbc.password(), jdbc.isolationLevel(), jdbc.maxPoolSize(), jdbc.minPoolSize(), jdbc.initialPoolSize(), jdbc.acquireIncrement(), jdbc.maxIdleTime(), jdbc.batchSize(), jdbc.flushInterval(), jdbc.maxRetries(), jdbc.storageLevel(), jdbc.queryPartitions(), jdbc.logSqlLength(), jdbc.config(), KeyNum._1) } /** * 将@Jdbc中配置的信息映射为键值对形式 * * @param Jdbc * Jdbc注解实例 */ @Internal def mapJdbc2(jdbc: Jdbc2): Unit = { this.mapJdbcConf(jdbc.url(), jdbc.driver(), jdbc.username(), jdbc.password(), jdbc.isolationLevel(), jdbc.maxPoolSize(), jdbc.minPoolSize(), jdbc.initialPoolSize(), jdbc.acquireIncrement(), jdbc.maxIdleTime(), jdbc.batchSize(), jdbc.flushInterval(), jdbc.maxRetries(), jdbc.storageLevel(), jdbc.queryPartitions(), jdbc.logSqlLength(), jdbc.config(), KeyNum._2) } /** * 将@Jdbc中配置的信息映射为键值对形式 * * @param Jdbc3 * Jdbc注解实例 */ @Internal def mapJdbc3(jdbc: Jdbc3): Unit = { this.mapJdbcConf(jdbc.url(), jdbc.driver(), jdbc.username(), jdbc.password(), jdbc.isolationLevel(), jdbc.maxPoolSize(), jdbc.minPoolSize(), jdbc.initialPoolSize(), 
jdbc.acquireIncrement(), jdbc.maxIdleTime(), jdbc.batchSize(), jdbc.flushInterval(), jdbc.maxRetries(), jdbc.storageLevel(), jdbc.queryPartitions(), jdbc.logSqlLength(), jdbc.config(), KeyNum._3) } /** * 将@Jdbc中配置的信息映射为键值对形式 * * @param Jdbc4 * Jdbc注解实例 */ @Internal def mapJdbc4(jdbc: Jdbc4): Unit = { this.mapJdbcConf(jdbc.url(), jdbc.driver(), jdbc.username(), jdbc.password(), jdbc.isolationLevel(), jdbc.maxPoolSize(), jdbc.minPoolSize(), jdbc.initialPoolSize(), jdbc.acquireIncrement(), jdbc.maxIdleTime(), jdbc.batchSize(), jdbc.flushInterval(), jdbc.maxRetries(), jdbc.storageLevel(), jdbc.queryPartitions(), jdbc.logSqlLength(), jdbc.config(), KeyNum._4) } /** * 将@Jdbc中配置的信息映射为键值对形式 * * @param Jdbc5 * Jdbc注解实例 */ @Internal def mapJdbc5(jdbc: Jdbc5): Unit = { this.mapJdbcConf(jdbc.url(), jdbc.driver(), jdbc.username(), jdbc.password(), jdbc.isolationLevel(), jdbc.maxPoolSize(), jdbc.minPoolSize(), jdbc.initialPoolSize(), jdbc.acquireIncrement(), jdbc.maxIdleTime(), jdbc.batchSize(), jdbc.flushInterval(), jdbc.maxRetries(), jdbc.storageLevel(), jdbc.queryPartitions(), jdbc.logSqlLength(), jdbc.config(), KeyNum._5) } /** * 用于映射Kafka相关配置信息 */ @Internal private def mapKafkaConf(brokers: String, topics: String, groupId: String, startingOffset: String, endingOffsets: String, autoCommit: Boolean, sessionTimeout: Long, requestTimeout: Long, pollInterval: Long, startFromTimestamp: Long, startFromGroupOffsets: Boolean, forceOverwriteStateOffset: Boolean, forceAutoCommit: Boolean, forceAutoCommitInterval: Long, config: Array[String], keyNum: Int = KeyNum._1 ): Unit = { this.put(KAFKA_BROKERS_NAME, brokers, keyNum) this.put(KAFKA_TOPICS, topics, keyNum) this.put(KAFKA_GROUP_ID, groupId, keyNum) this.put(KAFKA_STARTING_OFFSET, startingOffset, keyNum) this.put(KAFKA_ENDING_OFFSET, endingOffsets, keyNum) this.put(KAFKA_ENABLE_AUTO_COMMIT, autoCommit, keyNum) this.put(KAFKA_SESSION_TIMEOUT_MS, sessionTimeout, keyNum) this.put(KAFKA_REQUEST_TIMEOUT_MS, requestTimeout, keyNum) this.put(KAFKA_MAX_POLL_INTERVAL_MS, pollInterval, keyNum) this.put(KAFKA_START_FROM_TIMESTAMP, startFromTimestamp, keyNum) this.put(KAFKA_START_FROM_GROUP_OFFSETS, startFromGroupOffsets, keyNum) this.put(KAFKA_OVERWRITE_STATE_OFFSET, forceOverwriteStateOffset, keyNum) this.put(KAFKA_FORCE_AUTO_COMMIT, forceAutoCommit, keyNum) this.put(KAFKA_FORCE_AUTO_COMMIT_INTERVAL, forceAutoCommitInterval, keyNum) this.putConfig(kafkaConfStart, config, keyNum) } /** * 将@Kafka中配置的信息映射为键值对形式 * * @param Kafka * Kafka注解实例 */ @Internal def mapKafka(kafka: Kafka): Unit = { this.mapKafkaConf(kafka.brokers(), kafka.topics(), kafka.groupId(), kafka.startingOffset(), kafka.endingOffsets(), kafka.autoCommit(), kafka.sessionTimeout(), kafka.requestTimeout(), kafka.pollInterval(), kafka.startFromTimestamp(), kafka.startFromGroupOffsets(), kafka.forceOverwriteStateOffset(), kafka.forceAutoCommit(), kafka.forceAutoCommitInterval(), kafka.config(), KeyNum._1 ) } /** * 将@Kafka中配置的信息映射为键值对形式 * * @param Kafka2 * Kafka注解实例 */ @Internal def mapKafka2(kafka: Kafka2): Unit = { this.mapKafkaConf(kafka.brokers(), kafka.topics(), kafka.groupId(), kafka.startingOffset(), kafka.endingOffsets(), kafka.autoCommit(), kafka.sessionTimeout(), kafka.requestTimeout(), kafka.pollInterval(), kafka.startFromTimestamp(), kafka.startFromGroupOffsets(), kafka.forceOverwriteStateOffset(), kafka.forceAutoCommit(), kafka.forceAutoCommitInterval(), kafka.config(), KeyNum._2 ) } /** * 将@Kafka中配置的信息映射为键值对形式 * * @param Kafka3 * Kafka注解实例 */ @Internal def mapKafka3(kafka: Kafka3): Unit = { 
this.mapKafkaConf(kafka.brokers(), kafka.topics(), kafka.groupId(), kafka.startingOffset(), kafka.endingOffsets(), kafka.autoCommit(), kafka.sessionTimeout(), kafka.requestTimeout(), kafka.pollInterval(), kafka.startFromTimestamp(), kafka.startFromGroupOffsets(), kafka.forceOverwriteStateOffset(), kafka.forceAutoCommit(), kafka.forceAutoCommitInterval(), kafka.config(), KeyNum._3 ) } /** * 将@Kafka中配置的信息映射为键值对形式 * * @param Kafka4 * Kafka注解实例 */ @Internal def mapKafka4(kafka: Kafka4): Unit = { this.mapKafkaConf(kafka.brokers(), kafka.topics(), kafka.groupId(), kafka.startingOffset(), kafka.endingOffsets(), kafka.autoCommit(), kafka.sessionTimeout(), kafka.requestTimeout(), kafka.pollInterval(), kafka.startFromTimestamp(), kafka.startFromGroupOffsets(), kafka.forceOverwriteStateOffset(), kafka.forceAutoCommit(), kafka.forceAutoCommitInterval(), kafka.config(), KeyNum._4 ) } /** * 将@Kafka中配置的信息映射为键值对形式 * * @param Kafka5 * Kafka注解实例 */ @Internal def mapKafka5(kafka: Kafka5): Unit = { this.mapKafkaConf(kafka.brokers(), kafka.topics(), kafka.groupId(), kafka.startingOffset(), kafka.endingOffsets(), kafka.autoCommit(), kafka.sessionTimeout(), kafka.requestTimeout(), kafka.pollInterval(), kafka.startFromTimestamp(), kafka.startFromGroupOffsets(), kafka.forceOverwriteStateOffset(), kafka.forceAutoCommit(), kafka.forceAutoCommitInterval(), kafka.config(), KeyNum._5 ) } /** * 将@RocketMQ中配置的信息映射为键值对形式 * * @param RocketMQ * RocketMQ注解实例 */ @Internal def mapRocketMQConf(brokers: String, topics: String, groupId: String, consumerTag: String, startingOffset: String, autoCommit: Boolean, config: Array[String], keyNum: Int = KeyNum._1): Unit = { this.put(ROCKET_BROKERS_NAME, brokers, keyNum) this.put(ROCKET_TOPICS, topics, keyNum) this.put(ROCKET_GROUP_ID, groupId, keyNum) this.put(ROCKET_CONSUMER_TAG, consumerTag, keyNum) this.put(ROCKET_STARTING_OFFSET, startingOffset, keyNum) this.put(ROCKET_ENABLE_AUTO_COMMIT, autoCommit, keyNum) this.putConfig(rocketConfStart, config, keyNum) } /** * 将@RocketMQ中配置的信息映射为键值对形式 * * @param RocketMQ * RocketMQ注解实例 */ @Internal def mapRocketMQ(rocketmq: RocketMQ): Unit = { this.mapRocketMQConf(rocketmq.brokers(), rocketmq.topics, rocketmq.groupId, rocketmq.tag, rocketmq.startingOffset, rocketmq.autoCommit, rocketmq.config, KeyNum._1) } /** * 将@RocketMQ2中配置的信息映射为键值对形式 * * @param RocketMQ2 * RocketMQ注解实例 */ @Internal def mapRocketMQ2(rocketmq: RocketMQ2): Unit = { this.mapRocketMQConf(rocketmq.brokers(), rocketmq.topics, rocketmq.groupId, rocketmq.tag, rocketmq.startingOffset, rocketmq.autoCommit, rocketmq.config, KeyNum._2) } /** * 将@RocketMQ3中配置的信息映射为键值对形式 * * @param RocketMQ3 * RocketMQ注解实例 */ @Internal def mapRocketMQ3(rocketmq: RocketMQ3): Unit = { this.mapRocketMQConf(rocketmq.brokers(), rocketmq.topics, rocketmq.groupId, rocketmq.tag, rocketmq.startingOffset, rocketmq.autoCommit, rocketmq.config, KeyNum._3) } /** * 将@RocketMQ3中配置的信息映射为键值对形式 * * @param RocketMQ4 * RocketMQ注解实例 */ @Internal def mapRocketMQ4(rocketmq: RocketMQ4): Unit = { this.mapRocketMQConf(rocketmq.brokers(), rocketmq.topics, rocketmq.groupId, rocketmq.tag, rocketmq.startingOffset, rocketmq.autoCommit, rocketmq.config, KeyNum._4) } /** * 将@RocketMQ3中配置的信息映射为键值对形式 * * @param RocketMQ5 * RocketMQ注解实例 */ @Internal def mapRocketMQ5(rocketmq: RocketMQ5): Unit = { this.mapRocketMQConf(rocketmq.brokers(), rocketmq.topics, rocketmq.groupId, rocketmq.tag, rocketmq.startingOffset, rocketmq.autoCommit, rocketmq.config, KeyNum._5) } /** * 将@Hive中配置的信息映射为键值对形式 * * @param Hive * Hive注解实例 */ @Internal def mapHive(hive: 
Hive): Unit = { if (noEmpty(hive.value())) this.put(FireHiveConf.HIVE_CLUSTER, hive.value()) if (noEmpty(hive.cluster())) this.put(FireHiveConf.HIVE_CLUSTER, hive.cluster()) if (noEmpty(hive.catalog())) this.put(FireHiveConf.HIVE_CATALOG_NAME, hive.catalog()) if (noEmpty(hive.version())) this.put(FireHiveConf.HIVE_VERSION, hive.version()) if (noEmpty(hive.partition())) this.put(FireHiveConf.DEFAULT_TABLE_PARTITION_NAME, hive.partition()) } } object AnnoManager extends Logging { // 用于存放注册了的主键,只解析这些主键中的信息 private[fire] lazy val registerAnnoSet = Sets.newHashSet[Class[_]]( classOf[Hive], classOf[HBase], classOf[HBase2], classOf[HBase3], classOf[HBase4], classOf[HBase5], classOf[Jdbc], classOf[Jdbc2], classOf[Jdbc3], classOf[Jdbc4], classOf[Jdbc5], classOf[Kafka], classOf[Kafka2], classOf[Kafka3], classOf[Kafka4], classOf[Kafka5], classOf[RocketMQ], classOf[RocketMQ2], classOf[RocketMQ3], classOf[RocketMQ4], classOf[RocketMQ5] ) // 用于注册所有的生命周期注解 private[fire] lazy val registerAnnoMethod = List[Class[_ <: Annotation]](classOf[Process], classOf[Handle], classOf[Step1], classOf[Step2], classOf[Step3], classOf[Step4], classOf[Step5], classOf[Step6], classOf[Step7], classOf[Step8], classOf[Step9], classOf[Step10], classOf[Step11], classOf[Step12], classOf[Step13], classOf[Step14], classOf[Step15], classOf[Step16], classOf[Step17], classOf[Step18], classOf[Step19]) /** * 用于调起生命周期注解所标记的方法 */ protected[fire] def processAnno(baseFire: BaseFire): Unit = { tryWithLog { ReflectionUtils.invokeStepAnnoMethod(baseFire, this.registerAnnoMethod: _*) } (this.logger, "业务逻辑代码执行完成", "业务逻辑代码执行失败", isThrow = true) } /** * 用于调用指定的被注解标记的声明周期方法 */ protected[fire] def lifeCycleAnno(baseFire: BaseFire, annoClass: Class[_ <: Annotation]): Unit = { tryWithLog { ReflectionUtils.invokeAnnoMethod(baseFire, annoClass) } (this.logger, "生命周期方法调用成功", "声明周期方法调用失败", isThrow = true) } } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/connector/Connector.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.connector import com.zto.fire.common.conf.FireFrameworkConf import java.util.concurrent.ConcurrentHashMap import com.zto.fire.predef._ import com.zto.fire.common.util.ShutdownHookManager import org.slf4j.{Logger, LoggerFactory} /** * connector父接口,约定了open与close方法,子类需要根据具体 * 情况覆盖这两个方法。这两个方法不需要子类主动调用,会被自动调用 * * @author ChengLong * @since 2.0.0 * @create 2020-11-27 10:32 */ trait Connector extends Serializable { protected lazy val logger: Logger = LoggerFactory.getLogger(this.getClass) this.hook() /** * 用于注册释放资源 */ private[this] def hook(): Unit = { if (FireFrameworkConf.connectorShutdownHookEnable) { ShutdownHookManager.addShutdownHook() { () => { this.close() logger.info("release connector successfully.") } } } } /** * connector资源初始化 */ protected[fire] def open(): Unit = { this.logger.debug("init connector.") } /** * connector资源释放 */ protected def close(): Unit = { this.logger.debug("close connector.") } } /** * 支持多集群的connector * * @param keyNum * 对应的connector实例标识,不同的keyNum对应不同的集群连接实例 */ abstract class FireConnector(keyNum: Int = 1) extends Connector /** * 用于根据指定的keyNum创建不同的connector实例 */ abstract class ConnectorFactory[T <: Connector] extends Serializable { @transient private[fire] lazy val instanceMap = new ConcurrentHashMap[Int, T]() @transient protected lazy val logger: Logger = LoggerFactory.getLogger(this.getClass) /** * 约定创建connector子类实例的方法 */ protected def create(conf: Any = null, keyNum: Int = 1): T /** * 根据指定的keyNum返回单例的HBaseConnector实例 */ def getInstance(keyNum: Int = 1): T = this.instanceMap.get(keyNum) /** * 创建指定集群标识的connector对象实例 */ def apply(conf: Any = null, keyNum: Int = 1): T = { this.instanceMap.mergeGet(keyNum) { val instance: T = this.create(conf, keyNum) instance.open() instance } } } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/ext/BaseFireExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.ext import com.zto.fire.common.util.Tools /** * 隐式转换基类 * * @author ChengLong * @since 2.0.0 * @create 2020-12-16 15:55 */ trait BaseFireExt extends Tools ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/ext/Provider.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.ext import com.zto.fire.common.util.Logging /** * 为上层扩展层提供api集合 * * @author ChengLong * @since 2.0.0 * @create 2020-12-23 17:52 */ trait Provider extends Logging { } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/plugin/ArthasDynamicLauncher.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.plugin import com.zto.fire.predef._ import com.zto.fire.common.conf.FireFrameworkConf /** * Arthas启动器,可根据不同的引擎初始化不同的Arthas启动器实例 * * @author ChengLong 2021-11-11 11:09:02 * @since 2.2.0 */ private[fire] object ArthasDynamicLauncher extends ArthasLauncher { private lazy val launcher: ArthasLauncher = this.install /** * 根据不同的引擎初始化对应的Arthas启动器 * * @return */ private[this] def install: ArthasLauncher = { val launcher = FireFrameworkConf.arthasLauncher requireNonEmpty(launcher)(s"Arthas启动器不能为空,请通过${FireFrameworkConf.FIRE_ARTHAS_LAUNCHER}进行配置") this.logger.info(s"Arthas启动器${launcher}初始化成功!") Class.forName(launcher).newInstance().asInstanceOf[ArthasLauncher] } /** * 热启动Arthas * * @param isDistribute * 是否在每个container端启动arthas * @param ip * 仅在某些主机上启动 */ override def hotStart(isDistribute: Boolean, ip: String): Unit = this.launcher.hotStart(isDistribute, ip) /** * 分布式热关闭Arthas相关服务 * * @param isDistribute * 是否在每个container端停止arthas * @param ip * 仅在某些主机上启动 */ override def hotStop(isDistribute: Boolean, ip: String): Unit = this.launcher.hotStop(isDistribute, ip) /** * 分布式热重启rthas相关服务 * * @param isDistribute * 是否在每个container端停止arthas * @param ip * 仅在某些主机上启动 */ override def hotRestart(isDistribute: Boolean, ip: String): Unit = this.launcher.hotRestart(isDistribute, ip) } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/plugin/ArthasLauncher.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.plugin import com.zto.fire.predef._ import com.zto.fire.common.util.Logging import com.zto.fire.core.bean.ArthasParam /** * Arthas启动器 * * @author ChengLong 2021-11-3 15:38:20 * @since 2.2.0 */ private[fire] trait ArthasLauncher extends Logging { /** * 统一管理,用于执行start、stop、restart等命令 * * @param param * 用于封装Arthas相关命令的参数 */ def command(param: ArthasParam): Unit = { requireNonEmpty(param, param.getCommand)("Arthas管理命令不能为空,请检查") param.getCommand match { case "start" => this.hotStart(param.getDistribute, param.getIp) case "stop" => this.hotStop(param.getDistribute, param.getIp) case "restart" => this.hotRestart(param.getDistribute, param.getIp) } } /** * 热启动Arthas * * @param isDistribute * 是否在每个container端启动arthas * @param ip * 仅在某些主机上启动 */ def hotStart(isDistribute: Boolean, ip: String): Unit /** * 分布式热关闭Arthas相关服务 * * @param isDistribute * 是否在每个container端停止arthas * @param ip * 仅在某些主机上启动 */ def hotStop(isDistribute: Boolean, ip: String): Unit /** * 分布式热重启rthas相关服务 * * @param isDistribute * 是否在每个container端停止arthas * @param ip * 仅在某些主机上启动 */ def hotRestart(isDistribute: Boolean, ip: String): Unit } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/plugin/ArthasManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.core.plugin import com.taobao.arthas.agent.attach.ArthasAgent import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.util.{FireUtils, Logging, ReflectionUtils, ThreadUtils} import com.zto.fire.predef.{JHashMap, _} import java.util.concurrent.atomic.AtomicBoolean /** * Arthas插件管理 * * @author ChengLong 2021-11-2 14:45:43 * @since 2.2.0 */ private[fire] object ArthasManager extends Logging { private lazy val isStarted = new AtomicBoolean(false) private lazy val isStopped = new AtomicBoolean(true) // arthas启动需要消耗较长时间,使用inProcessing可避免在启动过程中执行stop/restart等命令 private lazy val inProcessing = new AtomicBoolean(false) /** * 启动Arthas服务 * * @param resourceId 用于标识分布式任务的master与slave * @param startContainer 是否在分布式环境下启用Arthas */ def startArthas(resourceId: String, startContainer: Boolean): Unit = { requireNonEmpty(resourceId)("resourceId不能为空,arthas所监控的程序必须有标识!") if (resourceId.contains("container") && !startContainer) return this.startArthas(resourceId) } /** * 关闭Arthas相关服务 */ def stopArthas: Unit = { if (this.isStopped.compareAndSet(false, true) && this.inProcessing.compareAndSet(false, true)) { this.logger.info("开始关闭Arthas相关服务") tryFinallyWithReturn { val bootstrap = ReflectionUtils.getFieldByName(classOf[ArthasAgent], "bootstrap").get(null) if (bootstrap != null) { val bootstrapClass = bootstrap.getClass ReflectionUtils.getMethodByName(bootstrapClass, "reset").invoke(bootstrap) ReflectionUtils.getMethodByName(bootstrapClass, "destroy").invoke(bootstrap) } } { this.isStarted.compareAndSet(true, false) this.inProcessing.compareAndSet(true, false) }(this.logger, "Arthas相关服务已关闭", "Arthas服务关闭失败!") } } /** * 启动Arthas服务 * * @param resourceId 用于标识分布式任务的master与slave */ def startArthas(resourceId: String): Unit = { if (this.isStarted.compareAndSet(false, true) && this.inProcessing.compareAndSet(false, true)) { this.logger.info("开始启动Arthas相关服务") ThreadUtils.run { tryWithLog { val configMap = new JHashMap[String, String]() configMap.put("arthas.appName", s"${FireUtils.engine}@${FireFrameworkConf.driverClassName}") configMap.put("arthas.telnetPort", "0") configMap.put("arthas.httpPort", "0") configMap.put("arthas.agentId", s"${FireUtils.engine}@${FireFrameworkConf.driverClassName}_$resourceId") configMap.put("arthas.tunnelServer", FireFrameworkConf.arthasTunnelServerUrl) configMap.put("arthas.username", "fire") configMap.put("arthas.password", FireFrameworkConf.driverClassName) configMap.putAll(FireFrameworkConf.arthasConfMap) ArthasAgent.attach(configMap) this.isStopped.compareAndSet(true, false) this.inProcessing.compareAndSet(true, false) }(this.logger, tryLog = "<-- Arthas服务已启动 -->") } } } /** * 重启Arthas相关服务 * * @param resourceId 用于标识分布式任务的master与slave */ def restartArthas(resourceId: String): Unit = { this.stopArthas this.startArthas(resourceId) } } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/rest/RestCase.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.rest import spark.{Request, Response} /** * 用于封装rest的相关信息 * * @param method * rest的提交方式:GET/POST/PUT/DELETE等 * @param path * rest服务地址 * @author ChengLong 2019-3-16 09:58:06 */ private[fire] case class RestCase(method: String, path: String, fun: (Request, Response) => AnyRef) ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/rest/RestServerManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.rest import com.zto.fire.common.bean.rest.ResultMsg import com.zto.fire.common.conf.{FireFrameworkConf, FirePS1Conf} import com.zto.fire.common.enu.ErrorCode import com.zto.fire.common.util._ import com.zto.fire.predef._ import spark._ import java.net.ServerSocket import scala.collection.mutable._ /** * Fire框架的rest服务管理器 * * @author ChengLong 2019-3-16 09:56:56 */ private[fire] class RestServerManager extends Logging { private[this] var port: JInt = null private[this] var restPrefix: String = _ private[this] var socket: ServerSocket = _ private[this] lazy val restList = ListBuffer[RestCase]() private[this] lazy val mainClassName: String = FireFrameworkConf.driverClassName /** * 注册新的rest接口 * * @param rest * rest的封装信息 * @return */ private[fire] def addRest(rest: RestCase): this.type = { this.restList += rest this } /** * 获取Fire RestServer占用的端口号 */ def restPort: Int = this.port /** * 为rest服务指定监听端口 */ private[fire] def startRestPort(port: Int = 0): this.type = this.synchronized { if (this.port == null && !RestServerManager.isStarted) { Spark.threadPool(FireFrameworkConf.restfulMaxThread, 1, -1) // 端口占用失败默认重试3次 if (port == 0) { retry(FireFrameworkConf.restfulPortRetryNum, FireFrameworkConf.restfulPortRetryDuration) { val randomPort = OSUtils.getRundomPort Spark.port(randomPort) this.port = randomPort } } else { Spark.port(port) this.port = port } // 获取到未被占用的端口后,rest server不会立即绑定,为了避免被其他应用占用 // 此处使用ServerSocket占用该端口,等真正启动rest server前再关闭该ServerSocket以便释放端口 this.socket = new ServerSocket(this.port) // 接口地址:hostname还是以ip地址 val address = if (FireFrameworkConf.restUrlHostname) OSUtils.getHostName else OSUtils.getIp this.restPrefix = s"http://$address:${this.port}" PropUtils.setProperty(FireFrameworkConf.FIRE_REST_URL, s"$restPrefix") } this } /** * 注册并以子线程方式开启rest服务 */ private[fire] def startRestServer: Unit = this.synchronized { if 
(!FireFrameworkConf.restEnable || RestServerManager.isStarted) return RestServerManager.isStarted = true if (this.port == null) this.startRestPort() // 批量注册接口地址 ThreadUtils.run { // 释放Socket占用的端口给RestServer使用,避免被其他服务所占用 if (socket != null && !socket.isClosed) socket.close() restList.filter(_ != null).foreach(rest => { if (FireFrameworkConf.fireRestUrlShow) logger.info(s"---------> start rest: ${FirePS1Conf.wrap(restPrefix + rest.path, FirePS1Conf.BLUE, FirePS1Conf.UNDER_LINE)} successfully. <---------") rest.method match { case "get" | "GET" => Spark.get(rest.path, new Route { override def handle(request: Request, response: Response): AnyRef = { rest.fun(request, response) } }) case "post" | "POST" => Spark.post(rest.path, new Route { override def handle(request: Request, response: Response): AnyRef = { rest.fun(request, response) } }) case "put" | "PUT" => Spark.put(rest.path, new Route { override def handle(request: Request, response: Response): AnyRef = { rest.fun(request, response) } }) case "delete" | "DELETE" => Spark.delete(rest.path, new Route { override def handle(request: Request, response: Response): AnyRef = { rest.fun(request, response) } }) } }) // 注册过滤器,用于进行权限校验 Spark.before(new Filter { override def handle(request: Request, response: Response): Unit = { if (FireFrameworkConf.restFilter) { val msg = checkAuth(request) if (msg != null && msg.getCode != null && ErrorCode.UNAUTHORIZED == msg.getCode) { Spark.halt(401, msg.toString) } } } }) } } /** * 通过header进行用户权限校验 */ private[fire] def checkAuth(request: Request): ResultMsg = { val auth = request.headers("Authorization") try { if (!EncryptUtils.checkAuth(auth, this.mainClassName)) { this.logger.warn(s"非法请求:用户身份校验失败!ip=${request.ip()} auth=$auth") ResultMsg.buildError(s"非法请求:用户身份校验失败!ip=${request.ip()}", ErrorCode.UNAUTHORIZED) } } catch { case e: Exception => { this.logger.error(s"非法请求:请检查请求参数!ip=${request.ip()} auth=$auth", e) ResultMsg.buildError(s"非法请求:请检查请求参数!ip=${request.ip()}", ErrorCode.UNAUTHORIZED) } } null } } private[fire] object RestServerManager { private[RestServerManager] var isStarted = false /** * 用于判断fire rest是否启动 */ def serverStarted: Boolean = this.isStarted } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/rest/SystemRestful.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
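A hedged sketch, not in the repository, of how a fire-internal component could register a custom endpoint with the classes above; RestCase and RestServerManager are private[fire], so this would sit inside the fire package, and the /system/ping path is an assumption.

import com.zto.fire.core.rest.{RestCase, RestServerManager}
import spark.{Request, Response}

val restManager = new RestServerManager
// buffer the endpoint, reserve a free port and record the rest url through PropUtils
restManager
  .addRest(RestCase("GET", "/system/ping", (req: Request, res: Response) => "pong"))
  .startRestPort()
// bind the port in a background thread and register every buffered RestCase plus the auth filter
restManager.startRestServer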
*/ package com.zto.fire.core.rest import com.zto.fire.common.anno.Rest import com.zto.fire.common.bean.analysis.ExceptionMsg import com.zto.fire.common.bean.rest.ResultMsg import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.enu.ErrorCode import com.zto.fire.common.util._ import com.zto.fire.core.BaseFire import com.zto.fire.core.bean.ArthasParam import com.zto.fire.core.plugin.ArthasDynamicLauncher import com.zto.fire.core.sync.SyncEngineConfHelper import com.zto.fire.predef.noEmpty import org.apache.commons.httpclient.Header import org.slf4j.{Logger, LoggerFactory} import spark.{Request, Response} import scala.collection.JavaConversions /** * 系统预定义的restful服务抽象 * * @author ChengLong 2020年4月2日 13:58:08 */ protected[fire] abstract class SystemRestful(engine: BaseFire) { protected lazy val logger: Logger = LoggerFactory.getLogger(this.getClass) this.register /** * 注册接口 */ protected def register: Unit /** * 获取当前任务所使用到的数据源信息 * * @return * 数据源列表 */ @Rest("/system/datasource") protected def datasource(request: Request, response: Response): AnyRef = { this.lineage(request, response) } /** * 获取当前任务所使用到的实时血缘信息 */ @Rest("/system/lineage") protected def lineage(request: Request, response: Response): AnyRef = { try { this.logger.info(s"Ip address ${request.ip()} request /system/lineage") val lineage = JSONUtils.toJSONString(SyncEngineConfHelper.syncLineage) this.logger.info(s"[lineage] 获取数据源列表成功:lineage=$lineage") ResultMsg.buildSuccess(lineage, "获取数据源列表成功") } catch { case e: Exception => { this.logger.error(s"[lineage] 获取实时血缘信息失败", e) ResultMsg.buildError("获取实时血缘信息失败", ErrorCode.ERROR) } } } /** * 启用Arthas进行性能诊断 * */ @Rest("/system/arthas") protected def arthas(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/arthas") this.logger.info(s"请求执行Arthas命令:$json") val arthasParam = JSONUtils.parseObject[ArthasParam](json) ArthasDynamicLauncher.command(arthasParam) this.logger.info(s"[arthas] Arthas命令${arthasParam.getCommand}执行成功!") ResultMsg.buildSuccess("操作成功", "调用arthas接口成功!") } catch { case e: Exception => { this.logger.error(s"[arthas] 调用arthas接口失败,参数不合法,请检查", e) ResultMsg.buildError("调用arthas接口失败,参数不合法,请检查", ErrorCode.ERROR) } } } /** * 异常信息采集接口 */ @Rest("/system/exception") protected def exception(request: Request, response: Response): AnyRef = { try { this.logger.info(s"Ip address ${request.ip()} request /system/exception") val msg = ExceptionBus.getAndClear val exceptions = msg._1.map(t => new ExceptionMsg(t._2, t._3)) logger.debug(s"异常诊断:本轮发送异常共计${msg._1.size}个.") ResultMsg.buildSuccess(JSONUtils.toJSONString(JavaConversions.seqAsJavaList(exceptions)), s"获取exception信息成功,共计:${exceptions.size}条") } catch { case e: Exception => { this.logger.error(s"调用exception接口失败,请检查", e) ResultMsg.buildError("调用exception接口失败,请检查", ErrorCode.ERROR) } } } } private[fire] object SystemRestful extends Logging { private var logCount = 0 /** * fire框架内部接口调用工具 * * @param urlSuffix * 接口后缀 * @param json * 请求参数 * @return * 接口响应结果 */ def restInvoke(urlSuffix: String, json: String = ""): String = { var response: String = "" if (FireFrameworkConf.restEnable && noEmpty(FireFrameworkConf.fireRestUrl, urlSuffix)) { val restful = FireFrameworkConf.fireRestUrl + urlSuffix try { val secret = EncryptUtils.md5Encrypt(FireFrameworkConf.dynamicKey) this.logger.debug(s"secret=${secret} restServerSecret=${FireFrameworkConf.restServerSecret} driverClassName=${FireFrameworkConf.driverClassName} 
date=${DateFormatUtils.formatCurrentDate}") response = if (noEmpty(json)) { HttpClientUtils.doPost(restful, json, new Header("Content-Type", "application/json"), new Header("Authorization", secret)) } else { HttpClientUtils.doGet(restful, new Header("Content-Type", "application/json"), new Header("Authorization", secret)) } } catch { case e: Exception => { if (this.logCount < 3) { this.logger.warn(s"fire内部接口自调用失败,对用户任务无影响,可忽略", e) this.logCount += 1 } } } } response } } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/sql/SqlExtensionsParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.sql /** * fire 框架sql扩展与解析器 * * @author ChengLong * @date 2022-05-10 10:01:52 * @since 2.2.2 */ private[fire] trait SqlExtensionsParser { } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/sql/SqlParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
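A hedged sketch, not in the repository, of the self-call helper above; SystemRestful is private[fire], and the JSON body for /system/arthas is an assumed shape (ArthasParam exposes getCommand, so a "command" field is plausible but unverified).

// GET without a body: returns the JSON produced by the /system/lineage handler
val lineageJson = SystemRestful.restInvoke("/system/lineage")
// POST with a body (field name "command" is an assumption)
val arthasResp = SystemRestful.restInvoke("/system/arthas", """{"command":"dashboard"}""")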
*/ package com.zto.fire.core.sql import com.zto.fire.common.anno.Internal import com.zto.fire.common.bean.TableIdentifier import com.zto.fire.common.conf.FireFrameworkConf._ import com.zto.fire.common.util.{LineageManager, Logging, SQLLineageManager, TableMeta, ThreadUtils} import com.zto.fire.predef._ import java.util.concurrent.{CopyOnWriteArraySet, TimeUnit} /** * 用于各引擎的SQL解析 * * @author ChengLong 2021-6-18 16:28:50 * @since 2.0.0 */ @Internal private[fire] trait SqlParser extends Logging { // 用于临时存放解析后的库表类 protected[fire] lazy val tmpTableMap = new JHashMap[String, TableMeta]() // 用于存放按数据源归类后的所有血缘信息 protected lazy val tableMetaSet = new CopyOnWriteArraySet[TableMeta]() protected[fire] lazy val hiveTableMap = new JConcurrentHashMap[String, Boolean]() protected lazy val buffer = new CopyOnWriteArraySet[String]() this.sqlParse /** * 周期性的解析SQL语句 */ @Internal protected def sqlParse: Unit = { if (lineageEnable) { ThreadUtils.scheduleWithFixedDelay({ this.buffer.foreach(sql => this.sqlParser(sql)) LineageManager.addTableMeta(this.tableMetaSet) this.clear }, lineageRunInitialDelay, lineageRunPeriod, TimeUnit.SECONDS) } } /** * 将解析后的血缘信息临时存放,并通过catalog进行归类后统一收集 * * @param tableIdentifier * 库表名 */ @Internal protected def addTmpTableMeta(tableIdentifier: String, tmpTableMap: TableMeta): Unit = { this.tmpTableMap += (tableIdentifier -> tmpTableMap) this.collectTableMeta(tmpTableMap) } /** * 用于收集并按catalog归类数据源信息 * * @param tableMeta 数据源 */ @Internal private def collectTableMeta(tableMeta: TableMeta): Unit = this.tableMetaSet += tableMeta /** * 清理解析后的SQL数据 */ @Internal private[this] def clear: Unit = { this.buffer.clear() this.tmpTableMap.clear() this.tableMetaSet.clear() } /** * 将待解析的SQL添加到buffer中 */ @Internal def sqlParse(sql: String): Unit = { if (lineageEnable && noEmpty(sql)) { SQLLineageManager.addStatement(sql) this.buffer += sql } } /** * 用于解析给定的SQL语句 */ @Internal def sqlParser(sql: String): Unit /** * SQL语法校验 * @param sql * sql statement * @return * true:校验成功 false:校验失败 */ @Internal def sqlLegal(sql: String): Boolean /** * 用于判断给定的表是否为临时表 */ @Internal def isTempView(tableIdentifier: TableIdentifier): Boolean /** * 用于判断给定的表是否为hive表 */ @Internal def isHiveTable(tableIdentifier: TableIdentifier): Boolean /** * 将库表名转为字符串 */ @Internal def tableIdentifier(dbName: String, tableName: String): String = s"$dbName.$tableName" } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/sync/LineageAccumulatorManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
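A minimal sketch, not in the repository, of a concrete SqlParser implementing the four abstract members of the trait above; a real engine-specific parser would delegate to the engine's own SQL parser and feed addTmpTableMeta instead of the stubbed answers used here.

package com.zto.fire.core.sql

import com.zto.fire.common.bean.TableIdentifier
import com.zto.fire.predef._

private[fire] object NaiveSqlParser extends SqlParser {
  // stub: a real implementation would walk the parsed plan and register each table it finds
  override def sqlParser(sql: String): Unit = logger.debug(s"parsing sql: $sql")
  override def sqlLegal(sql: String): Boolean = noEmpty(sql)
  override def isTempView(tableIdentifier: TableIdentifier): Boolean = false
  override def isHiveTable(tableIdentifier: TableIdentifier): Boolean = false
}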
*/ package com.zto.fire.core.sync import com.zto.fire.common.bean.lineage.Lineage import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.enu.Datasource import com.zto.fire.common.util.DatasourceDesc import com.zto.fire.predef._ import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.atomic.AtomicLong /** * 用于将各个container端数据收集到master端 * * @author ChengLong 2022-08-24 14:16:08 * @since 2.3.2 */ trait LineageAccumulatorManager extends SyncManager { private lazy val accumulator = new ConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]() private lazy val longCounter = new AtomicLong() /** * 将消息放到累加器中 */ def add(lineage: ConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]): Unit = { if (FireFrameworkConf.accEnable) this.accumulator.putAll(lineage) } /** * 累加Long类型数据 */ def add(value: Long): Unit = this.longCounter.addAndGet(value) /** * 获取收集到的消息 */ def getValue: Lineage } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/sync/SyncEngineConf.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
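A hedged usage sketch, not in the repository, of the collection side of LineageAccumulatorManager; building the Lineage returned by getValue is engine specific and omitted here.

import com.zto.fire.common.enu.Datasource
import com.zto.fire.common.util.DatasourceDesc
import com.zto.fire.predef._
import java.util.concurrent.ConcurrentHashMap

object LineageCollectSketch {
  def collect(manager: LineageAccumulatorManager): Unit = {
    val buffer = new ConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]()
    // ... filled from the container-side lineage manager (omitted) ...
    manager.add(buffer) // stored when fire's accumulator feature (accEnable) is on
    manager.add(1L)     // bump the long counter
  }
}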
*/ package com.zto.fire.core.sync import com.zto.fire.common.bean.lineage.Lineage import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.enu.Datasource import com.zto.fire.common.util.{DatasourceDesc, ReflectionUtils} import com.zto.fire.predef._ import java.util.concurrent.atomic.AtomicBoolean import scala.collection.immutable /** * 用于获取不同计算引擎的全局配置信息,同步到fire框架中,并传递到每一个分布式实例 * * @author ChengLong * @since 2.0.0 * @create 2021-03-02 10:48 */ private[fire] trait SyncEngineConf extends SyncManager { protected val isCollect = new AtomicBoolean(false) this.collect /** * 获取引擎的所有配置信息(send) */ def syncEngineConf: Map[String, String] /** * 在master端获取系统累加器中的数据 */ def syncLineage: Lineage /** * 同步引擎各个container的信息到累加器中 */ def collect: Unit } /** * 用于获取不同引擎的配置信息 */ private[fire] object SyncEngineConfHelper extends SyncEngineConf { private lazy val syncEngineClass: Class[_] = try { Class.forName(FireFrameworkConf.confDeployEngine) } catch { case e: Exception => logger.error(s"未找到引擎配置获取实现类${FireFrameworkConf.confDeployEngine},无法进行配置同步", e) throw e } private lazy val instance = syncEngineClass.newInstance() /** * 通过反射获取不同引擎的配置信息 */ override def syncEngineConf: Map[String, String] = { if (syncEngineClass != null) { val method = syncEngineClass.getDeclaredMethod("syncEngineConf") ReflectionUtils.setAccessible(method) method.invoke(instance).asInstanceOf[immutable.Map[String, String]] } else Map.empty } /** * 同步引擎各个container的信息到master端(collect) */ override def syncLineage: Lineage = { val method = syncEngineClass.getDeclaredMethod("syncLineage") ReflectionUtils.setAccessible(method) method.invoke(instance).asInstanceOf[Lineage] } /** * 同步引擎各个container的信息到master端(collect) */ override def collect: Unit = { val method = syncEngineClass.getDeclaredMethod("collect") ReflectionUtils.setAccessible(method) method.invoke(instance) } } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/sync/SyncManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.sync import com.zto.fire.common.util.Logging /** * 同步管理器: * 1. 用于Diver或JobManager端向Executor或TaskManager端同步数据 * 2. 用于将Executor或TaskManager端数据收集到driver或JobManager端 * * @author ChengLong 2021-11-2 15:41:30 * @since 2.2.0 */ trait SyncManager extends Logging { } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/task/FireInternalTask.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
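A hedged sketch, not in the repository: SyncEngineConfHelper simply reflects into whatever class the deploy-engine property points at (flink.properties later in this document sets flink.fire.conf.deploy.engine = com.zto.fire.flink.sync.SyncFlinkEngine).

// reflective dispatch to the configured engine implementation
val engineConf: Map[String, String] = SyncEngineConfHelper.syncEngineConf // engine-wide settings
val lineage = SyncEngineConfHelper.syncLineage                            // lineage gathered on the master side
SyncEngineConfHelper.collect                                              // trigger container -> master collection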
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.task import com.zto.fire.common.bean.runtime.RuntimeInfo import com.zto.fire.common.conf.{FireFrameworkConf, FirePS1Conf} import com.zto.fire.common.util.UnitFormatUtils.DateUnitEnum import com.zto.fire.common.util._ import com.zto.fire.core.BaseFire import com.zto.fire.predef._ import org.apache.commons.httpclient.Header import java.util.concurrent.atomic.AtomicBoolean /** * Fire框架内部的定时任务 * * @author ChengLong * @since 1.0.0 * @create 2020-07-14 11:02 */ private[fire] abstract class FireInternalTask(baseFire: BaseFire) extends Serializable with Logging { private[this] lazy val doJvmMonitor = new AtomicBoolean(true) protected lazy val registerLineageHook = new AtomicBoolean(false) /** * 定时采集运行时的jvm、gc、thread、cpu、memory、disk等信息 * 并将采集到的数据存放到EnvironmentAccumulator中 */ def jvmMonitor: Unit = { val runtimeInfo = RuntimeInfo.getRuntimeInfo if (runtimeInfo != null && logger != null && this.doJvmMonitor.get) { try { LogUtils.logStyle(this.logger, s"Jvm信息:${runtimeInfo.getIp}")(logger => { val jvmInfo = runtimeInfo.getJvmInfo val cpuInfo = runtimeInfo.getCpuInfo val threadInfo = runtimeInfo.getThreadInfo logger.info( s"""${FirePS1Conf.PINK} |GC -> YGC: ${jvmInfo.getMinorGCCount} YGCT: ${UnitFormatUtils.readable(jvmInfo.getMinorGCTime, UnitFormatUtils.TimeUnitEnum.MS)} FGC: ${jvmInfo.getFullGCCount} FGCT: ${UnitFormatUtils.readable(jvmInfo.getFullGCTime, UnitFormatUtils.TimeUnitEnum.MS)} |OnHeap -> Total: ${UnitFormatUtils.readable(jvmInfo.getMemoryTotal, DateUnitEnum.BYTE)} Used: ${UnitFormatUtils.readable(jvmInfo.getMemoryUsed, DateUnitEnum.BYTE)} Free: ${UnitFormatUtils.readable(jvmInfo.getMemoryFree, DateUnitEnum.BYTE)} HeapMax: ${UnitFormatUtils.readable(jvmInfo.getHeapMaxSize, DateUnitEnum.BYTE)} HeapUsed: ${UnitFormatUtils.readable(jvmInfo.getHeapUseSize, DateUnitEnum.BYTE)} Committed: ${UnitFormatUtils.readable(jvmInfo.getHeapCommitedSize, DateUnitEnum.BYTE)} |OffHeap -> Total: ${UnitFormatUtils.readable(jvmInfo.getNonHeapMaxSize, DateUnitEnum.BYTE)} Used: ${UnitFormatUtils.readable(jvmInfo.getNonHeapUseSize, DateUnitEnum.BYTE)} Committed: ${UnitFormatUtils.readable(jvmInfo.getNonHeapCommittedSize, DateUnitEnum.BYTE)} |CPUInfo -> Load: ${cpuInfo.getCpuLoad} LoadAverage: ${cpuInfo.getLoadAverage.mkString(",")} IoWait: ${cpuInfo.getIoWait} IrqTick: ${cpuInfo.getIrqTick} |Thread -> Total: ${threadInfo.getTotalCount} TotalStarted: ${threadInfo.getTotalStartedCount} Peak: ${threadInfo.getPeakCount} Deamon: ${threadInfo.getDeamonCount} CpuTime: ${UnitFormatUtils.readable(threadInfo.getCpuTime, UnitFormatUtils.TimeUnitEnum.MS)} UserTime: ${UnitFormatUtils.readable(threadInfo.getUserTime, UnitFormatUtils.TimeUnitEnum.MS)} ${FirePS1Conf.DEFAULT} |""".stripMargin) }) } catch { case _: Throwable => this.doJvmMonitor.set(false) } } } /** * 实时血缘发送定时任务,定时将血缘信息发送到kafka中 */ def lineage: Unit /** * 注册血缘shutdown hook,确保退出jvm前发送血缘信息到消息队列 */ def registerLineageHook(block: => Unit): Unit = { if 
(this.registerLineageHook.compareAndSet(false, true)) { ShutdownHookManager.addShutdownHook(ShutdownHookManager.HEIGHT_PRIORITY)(() => block) } } } ================================================ FILE: fire-core/src/main/scala/com/zto/fire/core/util/SingletonFactory.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.core.util import com.zto.fire.common.util.ValueUtils /** * 单例工厂 * * @author ChengLong * @since 2.0.0 * @create 2020-12-18 14:02 */ private[fire] trait SingletonFactory { @transient protected[this] var appName: String = _ /** * 设置TableEnv实例 */ protected[fire] def setAppName(appName: String): this.type = { if (ValueUtils.noEmpty(appName) && ValueUtils.isEmpty(this.appName)) this.appName = appName this } } ================================================ FILE: fire-engines/.gitignore ================================================ # use glob syntax. syntax: glob *.ser *.class *~ *.bak #*.off *.old # eclipse conf file .settings .classpath .project .manager .scala_dependencies # idea .idea *.iml # building target build null tmp* temp* dist test-output build.log # other scm .svn .CVS .hg* # switch to regexp syntax. # syntax: regexp # ^\.pc/ #SHITTY output not in target directory build.log ================================================ FILE: fire-engines/fire-flink/.gitignore ================================================ # use glob syntax. syntax: glob *.ser *.class *~ *.bak #*.off *.old # eclipse conf file .settings .classpath .project .manager .scala_dependencies # idea .idea *.iml # building target build null tmp* temp* dist test-output build.log # other scm .svn .CVS .hg* # switch to regexp syntax. 
# syntax: regexp # ^\.pc/ #SHITTY output not in target directory build.log ================================================ FILE: fire-engines/fire-flink/pom.xml ================================================ 4.0.0 fire-flink_${flink.reference} jar Fire : Engines : Flink com.zto.fire fire-engines 2.3.2-SNAPSHOT ../pom.xml com.zto.fire fire-enhance-flink_${flink.reference} ${fire.version} ${maven.scope} com.zto.fire fire-connector-flink-rocketmq_${flink.reference} ${fire.version} ${maven.scope} com.sparkjava spark-core ${sparkjava.version} ${maven.scope} javax.servlet javax.servlet-api 3.1.0 org.apache.flink flink-java ${flink.version} ${maven.scope} org.apache.flink flink-scala_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-streaming-scala_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-clients_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-runtime-web_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-queryable-state-client-java ${flink.version} ${maven.scope} org.apache.flink flink-statebackend-rocksdb_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-connector-kafka_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.kafka kafka_${scala.binary.version} ${kafka.version} ${maven.scope} org.apache.flink flink-table-api-java ${flink.version} ${maven.scope} org.apache.flink flink-table-api-java-bridge_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-table-api-scala-bridge_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-table-common ${flink.version} ${maven.scope} org.apache.flink flink-connector-hive_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-connector-jdbc_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-connector-elasticsearch-base_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-hadoop-compatibility_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-shaded-hadoop-2-uber 2.6.5-8.0 ${maven.scope} org.apache.hive hive-exec ${hive.flink.version} ${maven.scope} org.apache.hbase hbase-common ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-server ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-client ${hbase.version} ${maven.scope} org.apache.rocketmq rocketmq-client ${rocketmq.version} ${maven.scope} org.apache.rocketmq rocketmq-acl ${rocketmq.version} ${maven.scope} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-engines/fire-flink/src/main/java/com/zto/fire/flink/anno/Checkpoint.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.anno; import org.apache.flink.streaming.api.CheckpointingMode; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; import static org.apache.flink.streaming.api.environment.CheckpointConfig.*; /** * 基于注解flink checkpoint配置,优先级低于配置文件,高于@Config注解 * 注:@Checkpoint中相关时间单位均为秒 * * @author ChengLong 2022-04-26 11:16:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Checkpoint { /** * checkpoint周期(s) */ int value() default 60; /** * checkpoint周期(s),同value */ int interval() default -1; /** * checkpoint超时时间(s) */ int timeout() default -1; /** * 是否开启非对齐的checkpoint */ boolean unaligned() default true; /** * checkpoint的并发度 */ int concurrent() default -1; /** * 两次checkpoint的最短时间间隔 */ int pauseBetween() default -1; /** * 运行checkpoint失败的总次数 */ int failureNumber() default -1; /** * checkpoint的模式 */ CheckpointingMode mode() default CheckpointingMode.EXACTLY_ONCE; /** * 当任务停止时checkpoint的保持策略 */ ExternalizedCheckpointCleanup cleanup() default ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION; } ================================================ FILE: fire-engines/fire-flink/src/main/java/com/zto/fire/flink/anno/FlinkConf.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.anno; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行任务的配置,支持纯注解方式进行Flink相关参数配置 * * @author ChengLong 2022-08-18 08:57:23 * @since 2.3.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface FlinkConf { /** * 配置项列表,key=value的字符串形式 */ String[] props() default ""; /** * 配置的字符串 */ String value() default ""; } ================================================ FILE: fire-engines/fire-flink/src/main/java/com/zto/fire/flink/anno/Streaming.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.anno; import org.apache.flink.streaming.api.CheckpointingMode; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; import static org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup; /** * 基于注解flink配置,优先级低于配置文件,高于@Config注解低于@Checkpoint注解 * * @author ChengLong 2022-04-26 11:16:00 * @since 2.2.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Streaming { /** * checkpoint周期(s) */ int value() default 60; /** * checkpoint周期(s),同value */ int interval() default -1; /** * checkpoint超时时间(s) */ int timeout() default -1; /** * 是否开启非对齐的checkpoint */ boolean unaligned() default true; /** * checkpoint的并发度 */ int concurrent() default -1; /** * 两次checkpoint的最短时间间隔 */ int pauseBetween() default -1; /** * 运行checkpoint失败的总次数 */ int failureNumber() default -1; /** * checkpoint的模式 */ CheckpointingMode mode() default CheckpointingMode.EXACTLY_ONCE; /** * 当任务停止时checkpoint的保持策略 */ ExternalizedCheckpointCleanup cleanup() default ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION; /** * 是否自动提交job:call env.execute() */ boolean autoStart() default true; /** * 任务的并行度 */ int parallelism() default -1; /** * 是否禁用OperatorChaining */ boolean disableOperatorChaining() default false; /** * 状态的TTL时间(day) */ int stateTTL() default 31; } ================================================ FILE: fire-engines/fire-flink/src/main/java/com/zto/fire/flink/bean/CheckpointParams.java ================================================ package com.zto.fire.flink.bean; /** * checkpoint热修改参数 * {"interval":10000, "timeout":20000, "minPauseBetween": 10000} * @author ChengLong 2019-5-5 16:57:49 */ public class CheckpointParams { /** * checkpoint的频率 */ private Long interval; /** * checkpoint的超时时间 */ private Long timeout; /** * 两次checkpoint的最短时间间隔 */ private Long minPauseBetween; public Long getInterval() { return interval; } public void setInterval(Long interval) { this.interval = interval; } public Long getTimeout() { return timeout; } public void setTimeout(Long timeout) { this.timeout = timeout; } public Long getMinPauseBetween() { return minPauseBetween; } public void setMinPauseBetween(Long minPauseBetween) { this.minPauseBetween = minPauseBetween; } } ================================================ FILE: fire-engines/fire-flink/src/main/java/com/zto/fire/flink/bean/DistributeBean.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.bean; import com.zto.fire.flink.enu.DistributeModule; /** * 用于解析restful中的参数 * * @author ChengLong 2021-11-11 09:32:00 * @since 2.2.0 */ public class DistributeBean { private DistributeModule module; private String json; public DistributeBean() { } public DistributeBean(DistributeModule module, String json) { this.module = module; this.json = json; } public DistributeModule getModule() { return module; } public void setModule(DistributeModule module) { this.module = module; } public String getJson() { return json; } public void setJson(String json) { this.json = json; } } ================================================ FILE: fire-engines/fire-flink/src/main/java/com/zto/fire/flink/bean/FlinkTableSchema.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
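A hedged sketch, not in the repository, of driving checkpointing purely through the @Streaming annotation defined above; only the annotation members come from the definitions above, while the entry-class layout (and whatever fire base class it would extend) is an assumption.

import com.zto.fire.flink.anno.Streaming
import org.apache.flink.streaming.api.CheckpointingMode

// checkpoint every 60s, 5 min timeout, exactly-once, 4 parallel subtasks, 7-day state TTL
@Streaming(interval = 60, timeout = 300, mode = CheckpointingMode.EXACTLY_ONCE, parallelism = 4, stateTTL = 7)
object WordCountSketch {
  // a real fire task would extend the framework's Flink entry class (name omitted here) and hold the business logic
}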
*/ package com.zto.fire.flink.bean; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.table.api.DataTypes; import org.apache.flink.table.api.TableException; import org.apache.flink.table.api.TableSchema; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.utils.TypeConversions; import org.apache.flink.util.Preconditions; import java.io.Serializable; import java.util.*; import static org.apache.flink.table.types.utils.TypeConversions.fromDataTypeToLegacyInfo; /** * flink表模式,支持序列化 * @author ChengLong 2020年1月16日 16:56:23 */ public class FlinkTableSchema implements Serializable { private final String[] fieldNames; private final DataType[] fieldDataTypes; private final Map fieldNameToIndex; public FlinkTableSchema(TableSchema schema) { this(schema.getFieldNames(), schema.getFieldDataTypes()); } private FlinkTableSchema(String[] fieldNames, DataType[] fieldDataTypes) { this.fieldNames = Preconditions.checkNotNull(fieldNames); this.fieldDataTypes = Preconditions.checkNotNull(fieldDataTypes); fieldNameToIndex = new HashMap<>(); final Set duplicateNames = new HashSet<>(); final Set uniqueNames = new HashSet<>(); for (int i = 0; i < fieldNames.length; i++) { // check for null Preconditions.checkNotNull(fieldDataTypes[i]); final String fieldName = Preconditions.checkNotNull(fieldNames[i]); // collect indices fieldNameToIndex.put(fieldName, i); // check uniqueness of field names if (uniqueNames.contains(fieldName)) { duplicateNames.add(fieldName); } else { uniqueNames.add(fieldName); } } if (!duplicateNames.isEmpty()) { throw new TableException( "Field names must be unique.\n" + "List of duplicate fields: " + duplicateNames.toString() + "\n" + "List of all fields: " + Arrays.toString(fieldNames)); } } /** * Returns all field data types as an array. */ public DataType[] getFieldDataTypes() { return fieldDataTypes; } /** * This method will be removed in future versions as it uses the old type system. It * is recommended to use {@link #getFieldDataTypes()} instead which uses the new type * system based on {@link DataTypes}. Please make sure to use either the old or the new * type system consistently to avoid unintended behavior. See the website documentation * for more information. */ public TypeInformation[] getFieldTypes() { return fromDataTypeToLegacyInfo(fieldDataTypes); } /** * Returns the specified data type for the given field index. * * @param fieldIndex the index of the field */ public Optional getFieldDataType(int fieldIndex) { if (fieldIndex < 0 || fieldIndex >= fieldDataTypes.length) { return Optional.empty(); } return Optional.of(fieldDataTypes[fieldIndex]); } /** * This method will be removed in future versions as it uses the old type system. It * is recommended to use {@link #getFieldDataType(int)} instead which uses the new type * system based on {@link DataTypes}. Please make sure to use either the old or the new * type system consistently to avoid unintended behavior. See the website documentation * for more information. */ public Optional> getFieldType(int fieldIndex) { return getFieldDataType(fieldIndex) .map(TypeConversions::fromDataTypeToLegacyInfo); } /** * Returns the specified data type for the given field name. 
* * @param fieldName the name of the field */ public Optional getFieldDataType(String fieldName) { if (fieldNameToIndex.containsKey(fieldName)) { return Optional.of(fieldDataTypes[fieldNameToIndex.get(fieldName)]); } return Optional.empty(); } /** * This method will be removed in future versions as it uses the old type system. It * is recommended to use {@link #getFieldDataType(String)} instead which uses the new type * system based on {@link DataTypes}. Please make sure to use either the old or the new * type system consistently to avoid unintended behavior. See the website documentation * for more information. */ public Optional> getFieldType(String fieldName) { return getFieldDataType(fieldName) .map(TypeConversions::fromDataTypeToLegacyInfo); } /** * Returns the number of fields. */ public int getFieldCount() { return fieldNames.length; } /** * Returns all field names as an array. */ public String[] getFieldNames() { return fieldNames; } /** * Returns the specified name for the given field index. * * @param fieldIndex the index of the field */ public Optional getFieldName(int fieldIndex) { if (fieldIndex < 0 || fieldIndex >= fieldNames.length) { return Optional.empty(); } return Optional.of(fieldNames[fieldIndex]); } @Override public String toString() { final StringBuilder sb = new StringBuilder(); sb.append("root\n"); for (int i = 0; i < fieldNames.length; i++) { sb.append(" |-- ").append(fieldNames[i]).append(": ").append(fieldDataTypes[i]).append('\n'); } return sb.toString(); } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } FlinkTableSchema schema = (FlinkTableSchema) o; return Arrays.equals(fieldNames, schema.fieldNames) && Arrays.equals(fieldDataTypes, schema.fieldDataTypes); } @Override public int hashCode() { int result = Arrays.hashCode(fieldNames); result = 31 * result + Arrays.hashCode(fieldDataTypes); return result; } } ================================================ FILE: fire-engines/fire-flink/src/main/java/com/zto/fire/flink/enu/DistributeModule.java ================================================ package com.zto.fire.flink.enu; import org.apache.commons.lang3.StringUtils; /** * 模块类型,用于标识不同的模块 * * @author ChengLong 2021-11-11 09:34:48 * @since 2.2.0 */ public enum DistributeModule { CONF("conf"), ARTHAS("arthas"); DistributeModule(String type) { } /** * 将字符串解析成指定的枚举类型 */ public static DistributeModule parse(String type) { if (StringUtils.isBlank(type)) { return CONF; } try { return Enum.valueOf(DistributeModule.class, type.trim().toUpperCase()); } catch (Exception e) { return CONF; } } } ================================================ FILE: fire-engines/fire-flink/src/main/java/com/zto/fire/flink/ext/watermark/FirePeriodicWatermarks.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.zto.fire.flink.ext.watermark;

import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks;
import org.apache.flink.streaming.api.watermark.Watermark;

/**
 * Wrapper around AssignerWithPeriodicWatermarks.
 * The parameterized constructor allows specifying the maximum tolerated out-of-orderness (in ms).
 * maxTimestamp can be used to customize the watermark timestamp; if it is not set, the current system time is used as the watermark value.
 *
 * @author ChengLong 2020-4-17 17:18:33
 */
public abstract class FirePeriodicWatermarks<T> implements AssignerWithPeriodicWatermarks<T> {
    // used to compute the watermark value; when 0, the current system time is used
    protected long maxTimestamp = 0;
    // maximum tolerated out-of-orderness, 10s by default
    protected long maxOutOfOrder = 10 * 1000L;
    // reference to the current watermark
    protected transient Watermark watermark = new Watermark(System.currentTimeMillis());

    protected FirePeriodicWatermarks() {
    }

    /**
     * Allows customizing the maximum tolerated out-of-orderness.
     *
     * @param maxOutOfOrder the user-defined maximum out-of-orderness
     */
    protected FirePeriodicWatermarks(long maxOutOfOrder) {
        if (maxOutOfOrder > 0) {
            this.maxOutOfOrder = maxOutOfOrder;
        }
    }

    /**
     * Computes and returns the current watermark.
     * If no watermark timestamp has been specified, the current system time is used by default.
     */
    @Override
    public Watermark getCurrentWatermark() {
        if (this.maxTimestamp == 0) {
            this.watermark = new Watermark(System.currentTimeMillis() - this.maxOutOfOrder);
        } else {
            this.watermark = new Watermark(this.maxTimestamp - this.maxOutOfOrder);
        }
        return this.watermark;
    }
}

================================================
FILE: fire-engines/fire-flink/src/main/java/com/zto/fire/flink/sink/BaseSink.scala
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
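A hedged sketch, not in the repository: a concrete assigner built on FirePeriodicWatermarks for an assumed event type Order(createTime), tolerating 5 seconds of out-of-orderness.

import com.zto.fire.flink.ext.watermark.FirePeriodicWatermarks

case class Order(createTime: Long)

class OrderWatermarks extends FirePeriodicWatermarks[Order](5000L) {
  // extractTimestamp comes from Flink's AssignerWithPeriodicWatermarks contract
  override def extractTimestamp(order: Order, previousElementTimestamp: Long): Long = {
    this.maxTimestamp = math.max(this.maxTimestamp, order.createTime) // let the watermark follow event time
    order.createTime
  }
}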
*/ package com.zto.fire.flink.sink import com.zto.fire.common.util.Logging import org.apache.flink.configuration.Configuration import org.apache.flink.runtime.state.{FunctionInitializationContext, FunctionSnapshotContext} import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction} import java.util.concurrent._ import java.util.concurrent.atomic.AtomicBoolean import scala.util.control._ /** * fire框架基础的flink sink类 * 提供按批次、固定频率定时flush、checkpoint等功能 * JDBC sink、HBase sink可继承自此类,并实现自己的flush方法,完成数据的sink * * @param batch 每批大小,达到该阈值将批量sink到目标组件 * @param flushInterval 每隔多久刷新一次到目标组件(ms) * @author ChengLong * @since 1.1.0 * @create 2020-05-21 15:27 */ abstract class BaseSink[IN, OUT](batch: Int, flushInterval: Long) extends RichSinkFunction[IN] with CheckpointedFunction with Logging { protected var maxRetry: Long = 3 private var flushException: Exception = _ @transient protected var scheduler: ScheduledExecutorService = _ @transient protected var scheduledFuture: ScheduledFuture[_] = _ protected lazy val closed = new AtomicBoolean(false) @transient protected lazy val buffer = new CopyOnWriteArrayList[OUT]() /** * 初始化定时调度器,用于定时flush数据到目标组件 */ override def open(parameters: Configuration): Unit = { if (this.flushInterval > 0 && batch > 0) { this.scheduler = Executors.newScheduledThreadPool(1) if (this.scheduler != null) { this.scheduledFuture = this.scheduler.scheduleWithFixedDelay(new Runnable { override def run(): Unit = this.synchronized { if (closed.get()) return flush } }, this.flushInterval, this.flushInterval, TimeUnit.MILLISECONDS) } } } /** * 将数据sink到目标组件 * 不同的组件需定义该flush逻辑实现不同组件的flush操作 */ def sink: Unit = { // sink逻辑 } /** * 将数据构建成sink的格式 */ def map(value: IN): OUT /** * 关闭资源 * 1. 关闭定时flush线程池 * 2. 
将缓冲区中的数据flush到目标组件 */ override def close(): Unit = { if (closed.get()) return closed.compareAndSet(false, true) this.checkFlushException if (this.scheduledFuture != null) { scheduledFuture.cancel(false) this.scheduler.shutdown() } if (this.buffer.size > 0) { this.flush } } /** * 将数据sink到缓冲区中 */ override def invoke(value: IN, context: SinkFunction.Context): Unit = { this.checkFlushException val out = this.map(value) if (out != null) this.buffer.add(out) if (this.buffer.size >= this.batch) { this.flush } } /** * 内部的flush,调用用户定义的flush方法 * 并清空缓冲区,将缓冲区大小归零 */ def flush: Unit = this.synchronized { this.checkFlushException if (this.buffer != null && this.buffer.size > 0) { this.logger.info(s"执行flushInternal操作 sink.size=${this.buffer.size()} batch=${this.batch} flushInterval=${this.flushInterval}") val loop = new Breaks loop.breakable { if (this.maxRetry < 1) this.maxRetry = 1 for (i <- 1L to this.maxRetry) { try { this.sink this.buffer.clear() loop.break } catch { case e: Exception => { this.logger.error(s"执行flushInternal操作失败,正在进行第${i}次重试。", e) if (i >= this.maxRetry) { this.flushException = e } Thread.sleep(1000 * i) } } } } } } /** * checkpoint时将数据全部flush */ override def snapshotState(context: FunctionSnapshotContext): Unit = { this.flush } override def initializeState(context: FunctionInitializationContext): Unit = { // initializeState } /** * 用于检测在flush过程中是否有异常,如果存在异常,则不再flush */ private def checkFlushException: Unit = { if (flushException != null) throw new RuntimeException(s"${this.getClass.getSimpleName} writing records failed.", flushException) } } ================================================ FILE: fire-engines/fire-flink/src/main/java/com/zto/fire/flink/sink/HBaseSink.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.sink import com.zto.fire._ import com.zto.fire.hbase.HBaseConnector import com.zto.fire.hbase.bean.HBaseBaseBean import com.zto.fire.hbase.conf.FireHBaseConf import scala.reflect.ClassTag /** * flink HBase sink组件,底层基于HBaseConnector * * @author ChengLong * @since 1.1.0 * @create 2020-5-25 16:06:15 */ abstract class HBaseSink[IN, T <: HBaseBaseBean[T] : ClassTag](tableName: String, batch: Int = 100, flushInterval: Long = 10000, keyNum: Int = 1) extends BaseSink[IN, T](batch, flushInterval) { // hbase操作失败时允许最大重试次数 this.maxRetry = FireHBaseConf.hbaseMaxRetry() /** * 将数据sink到hbase * 该方法会被flush方法自动调用 */ override def sink: Unit = { HBaseConnector.insert(this.tableName, this.buffer, this.keyNum) } } ================================================ FILE: fire-engines/fire-flink/src/main/java/com/zto/fire/flink/sink/JdbcSink.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
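A minimal hedged sketch, not in the repository: about the smallest possible BaseSink subclass, buffering records and "flushing" them to the log; real sinks such as HBaseSink above replace the sink body with a write to the target store.

import com.zto.fire.flink.sink.BaseSink

class LogSink[IN] extends BaseSink[IN, String](100, 10000) { // batch = 100, flush every 10s
  // map: convert each incoming record into the buffered representation
  override def map(value: IN): String = if (value == null) null else value.toString
  // sink: invoked by flush with up to `batch` mapped records sitting in this.buffer
  override def sink: Unit = this.buffer.forEach(record => this.logger.info(s"sink -> $record"))
}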
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.sink import com.zto.fire.predef._ import com.zto.fire.jdbc.JdbcConnector import com.zto.fire.jdbc.conf.FireJdbcConf /** * flink jdbc sink组件,底层基于JdbcConnector * * @author ChengLong * @since 1.1.0 * @create 2020-05-22 10:37 */ abstract class JdbcSink[IN](sql: String, batch: Int = 10, flushInterval: Long = 1000, keyNum: Int = 1) extends BaseSink[IN, Seq[Any]](batch, flushInterval) { // jdbc操作失败时允许最大重试次数 this.maxRetry = FireJdbcConf.maxRetry(keyNum) /** * 将数据sink到jdbc * 该方法会被flush方法自动调用 */ override def sink: Unit = { JdbcConnector.executeBatch(sql, this.buffer, keyNum = keyNum) } } ================================================ FILE: fire-engines/fire-flink/src/main/java/com/zto/fire/flink/task/FlinkSchedulerManager.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.task; import com.zto.fire.core.task.SchedulerManager; /** * Flink 定时调度任务管理器 * * @author ChengLong * @create 2020-12-18 17:20 * @since 1.0.0 */ public class FlinkSchedulerManager extends SchedulerManager { // 单例对象 private static SchedulerManager instance = null; static { instance = new FlinkSchedulerManager(); } private FlinkSchedulerManager() { } /** * 获取单例实例 */ public static SchedulerManager getInstance() { return instance; } @Override protected String label() { return DRIVER; } } ================================================ FILE: fire-engines/fire-flink/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory ================================================ com.zto.fire.flink.sql.connector.rocketmq.RocketMQDynamicTableFactory ================================================ FILE: fire-engines/fire-flink/src/main/resources/flink-batch.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. 
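A hedged sketch, not in the repository: a JdbcSink for an assumed OrderRecord(id, amount) type; the Seq returned by map fills the SQL placeholders in order, and the table name is illustrative.

import com.zto.fire.flink.sink.JdbcSink

case class OrderRecord(id: Long, amount: Double)

class OrderJdbcSink extends JdbcSink[OrderRecord]("insert into t_order(id, amount) values(?, ?)") {
  override def map(order: OrderRecord): Seq[Any] = Seq(order.id, order.amount)
}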
# The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # flink.fire.config_center.enable = false ================================================ FILE: fire-engines/fire-flink/src/main/resources/flink-streaming.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # checkpoint\u76F8\u5173\u914D\u7F6E # checkpoint\u9891\u7387\uFF0C-1\u8868\u793A\u5173\u95ED flink.stream.checkpoint.interval = -1 # EXACTLY_ONCE/AT_LEAST_ONCE flink.stream.checkpoint.mode = EXACTLY_ONCE # checkpoint\u8D85\u65F6\u65F6\u95F4\uFF0C\u5355\u4F4D\uFF1A\u6BEB\u79D2 flink.stream.checkpoint.timeout = 600000 # \u540C\u65F6checkpoint\u64CD\u4F5C\u7684\u5E76\u53D1\u6570 flink.stream.checkpoint.max.concurrent = 1 # \u4E24\u6B21checkpoint\u7684\u6700\u5C0F\u505C\u987F\u65F6\u95F4 flink.stream.checkpoint.min.pause.between = -1 # \u5982\u679C\u6709\u66F4\u8FD1\u7684checkpoint\u65F6\uFF0C\u662F\u5426\u5C06\u4F5C\u4E1A\u56DE\u9000\u5230\u8BE5\u68C0\u67E5\u70B9 flink.stream.checkpoint.prefer.recovery = false # \u53EF\u5BB9\u5FCDcheckpoint\u5931\u8D25\u7684\u6B21\u6570\uFF0C\u9ED8\u8BA4\u4E0D\u5141\u8BB8\u5931\u8D25 flink.stream.checkpoint.tolerable.failure.number = 0 # \u5F53cancel job\u65F6\u4FDD\u7559checkpoint flink.stream.checkpoint.externalized = RETAIN_ON_CANCELLATION # \u662F\u5426\u5F00\u542F\u975E\u5BF9\u9F50\u7684checkpoint flink.stream.checkpoint.unaligned.enable = false # \u8FD0\u884C\u6A21\u5F0F\uFF1ASTREAMING/BATCH/AUTOMATIC # flink.runtime.mode = STREAMING # \u6307\u5B9A\u6D88\u8D39rocketmq\u7684\u8D77\u59CB\u6D88\u8D39\u4F4D\u70B9 flink.rocket.starting.offsets = latest # \u9ED8\u8BA4KeyedState\u7684ttl\u65F6\u95F4\uFF08\u8C03\u7528\u7AEF\u53EF\u8986\u76D6\uFF09 flink.state.ttl.days = 31 # \u5F53\u72B6\u6001\u83B7\u53D6\u8017\u65F6\u8D85\u8FC7\u8BE5\u9608\u503C\u65F6\u5C06\u8BB0\u5F55\u65E5\u5FD7\uFF0C\u5C0F\u4E8E1\u8868\u793A\u4E0D\u8BB0\u5F55\u65E5\u5FD7 flink.state.log.threshold = 50 # \u5F53\u72B6\u6001\u83B7\u53D6\u8017\u65F6\u8D85\u8FC7\u8BE5\u9608\u503C\u65F6\u5C06\u8BB0\u5F55\u65E5\u5FD7\u7684\u65E5\u5FD7\u6761\u6570\uFF0C\u5C0F\u4E8E1\u8868\u793A\u4E0D\u9650\u884C\u6570 flink.state.log.threshold.max_count = 300000 # \u662F\u5426\u81EA\u52A8\u5C06insert\u8BED\u53E5\u52A0\u5165\u5230StatementSet\u4E2D(StatementSet.addInsertSql) 
# whether insert statements are automatically added to the StatementSet (StatementSet.addInsertSql)
flink.sql.auto.add.statementSet = true
# strategy for choosing the local disk path used for state: default/round_robin; must be used together with the flink.state.external.zookeeper parameter
flink.state.choose.disk.policy = default
# whether operator chaining is enabled; can be set to false when debugging to make problems easier to trace
flink.env.operatorChaining.enable = true


================================================
FILE: fire-engines/fire-flink/src/main/resources/flink.properties
================================================
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# ----------------------------------------------- < fire settings > ------------------------------------------------ #
# maximum number of threads for the fire framework REST service
fire.restful.max.thread = 10
# whether fire's distributed data sync is enabled; when on, data on the JobManager side is synchronized to every TaskManager
fire.distribute.sync.enable = true
# arthas launcher class used by the flink engine
fire.analysis.arthas.launcher = com.zto.fire.flink.plugin.FlinkArthasLauncher
# whether System.exit is called when shutdown is executed
fire.shutdown.auto.exit = true
# annotation-to-configuration mapping manager class
fire.conf.anno.manager.class = com.zto.fire.flink.conf.FlinkAnnoManager

# ----------------------------------------------- < flink settings > ----------------------------------------------- #
# flink application name; when empty the class name is used
flink.appName =
# kafka group id; when empty the class name is used
flink.kafka.group.id =
# bigdata means the big-data Kafka cluster, zms means the zms Kafka cluster
flink.kafka.brokers.name =
# topic list
flink.kafka.topics =
# consumer starting offset used at startup; defaults to the latest
flink.kafka.starting.offsets =
# fail when data loss is detected
flink.kafka.failOnDataLoss = true
# whether auto commit is enabled
flink.kafka.enable.auto.commit = false
# whether offsets are committed on checkpoints
flink.kafka.CommitOffsetsOnCheckpoints = true
# start consuming Kafka from the given timestamp
flink.kafka.StartFromTimestamp = 0
# resume from the group's last committed offsets for the topic; requires the group.id parameter
flink.kafka.StartFromGroupOffsets = false
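# ------------------------------------------------------------------------------------------------ #
# Illustrative example (hypothetical values, not shipped defaults): a job consuming one topic from
# the big-data Kafka cluster would typically fill in the blank keys above along these lines:
# flink.kafka.brokers.name = bigdata
# flink.kafka.topics = fire_example_topic
# flink.kafka.group.id = fire_example_group
# ------------------------------------------------------------------------------------------------ #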
# whether offsets stored in state are overwritten (configure with caution; intended for operations such as Kafka cluster migration)
flink.kafka.force.overwrite.stateOffset.enable = false
# whether periodic offset commits are forced even when checkpointing is enabled
flink.kafka.force.autoCommit.enable = false
# interval (ms) of the periodic offset commits
flink.kafka.force.autoCommit.Interval = 30000
# keys prefixed with flink.kafka.conf support every kafka client option
#flink.kafka.conf.session.timeout.ms = 300000
#flink.kafka.conf.request.timeout.ms = 400000
# default log level
flink.log.level = WARN
# flink sql options; keys prefixed with flink.sql.conf. are loaded automatically
#flink.sql.conf.table.exec.mini-batch.enabled = false
#flink.sql.conf.table.exec.state.ttl = 0 ms
# flink sql udf registration; keys start with flink.sql.udf.; the setting below is equivalent to: CREATE FUNCTION fireUdf AS 'com.zto.fire.examples.flink.stream.Udf'
flink.sql.udf.fireUdf = com.zto.fire.examples.flink.stream.Udf
flink.sql.udf.fireUdf.enable = false
# the subclass implementation that performs configuration sync under the flink engine
flink.fire.conf.deploy.engine = com.zto.fire.flink.sync.SyncFlinkEngine
# whether to log the flink sql after its with clause has been assembled; disabled by default because with options may contain sensitive information
flink.sql.log.enable = false
# whether with options from the configuration file forcibly replace the with clause already present in the sql; when enabled and with options are configured, the with list of the sql in the code is replaced
flink.sql_with.replaceMode.enable = true
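# ------------------------------------------------------------------------------------------------ #
# Illustrative example of the flink.sql.with.* prefix documented in FireFlinkConf further below
# (values are examples only): the two keys define a datasource named bill_db whose options are
# injected into the sql with clause via 'datasource'='bill_db':
# flink.sql.with.bill_db.connector = mysql
# flink.sql.with.bill_db.url = jdbc:mysql://localhost:3306/fire
# ------------------------------------------------------------------------------------------------ #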
# enable the WebUI flame graph
rest.flamegraph.enabled = true

# ----------------------------------------------- < hive settings > ----------------------------------------------- #
# hive cluster name (batch = offline hive, streaming = 180-cluster hive, test = local test hive); used by flink to read hive metadata across clusters
flink.hive.cluster =
# hive version integrated with flink
flink.hive.version = 1.2.0
# default hive database
fire.hive.default.database.name = tmp
# default hive partition field name
flink.default.table.partition.name = ds
# hive catalog name
flink.hive.catalog.name = hive

# ----------------------------------------------- < HBase settings > ----------------------------------------------- #
# distinguishes between hbase clusters: batch/streaming/old
flink.hbase.cluster = batch
# number of records per HBase read/write batch
flink.hbase.batch.size = 10000

# ----------------------------------------------- < flink parameters > ----------------------------------------------- #
# the flink tuning parameters listed below are loaded and applied automatically by fire
flink.auto.generate.uid.enable = true
flink.auto.type.registration.enable = true
flink.force.avro.enable = false
flink.force.kryo.enable = false
flink.generic.types.enable = true
flink.object.reuse.enable = false
flink.auto.watermark.interval = -1
# default: RECURSIVE; valid values: RECURSIVE, NONE, TOP_LEVEL
flink.closure.cleaner.level = recursive
flink.default.input.dependency.constraint = any
# default: PIPELINED; valid values: PIPELINED, PIPELINED_FORCED, BATCH, BATCH_FORCED
flink.execution.mode = pipelined
flink.latency.tracking.interval =
flink.max.parallelism = 10240
flink.default.parallelism =
flink.task.cancellation.interval =
flink.task.cancellation.timeout.millis =
flink.use.snapshot.compression = false
flink.stream.buffer.timeout.millis =
flink.stream.number.execution.retries =
flink.stream.time.characteristic =
# whether the configuration is synchronized to the taskmanager side
flink.fire.deploy_conf.enable = false
# name of the default flink catalog
flink.sql.default.catalog.name = default_catalog


================================================
FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/BaseFlink.scala
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package com.zto.fire.flink import com.zto.fire._ import com.zto.fire.common.conf.{FireFrameworkConf, FireHDFSConf, FireHiveConf} import com.zto.fire.common.util.{OSUtils, PropUtils} import com.zto.fire.core.BaseFire import com.zto.fire.core.rest.RestServerManager import com.zto.fire.flink.conf.FireFlinkConf import com.zto.fire.flink.rest.FlinkSystemRestful import com.zto.fire.flink.task.{FlinkInternalTask, FlinkSchedulerManager} import com.zto.fire.flink.util.{FlinkSingletonFactory, FlinkUtils} import org.apache.commons.lang3.StringUtils import org.apache.flink.api.common.ExecutionConfig import org.apache.flink.api.java.utils.ParameterTool import org.apache.flink.api.scala.ExecutionEnvironment import org.apache.flink.configuration.{Configuration, GlobalConfiguration} import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic} import org.apache.flink.table.catalog.hive.HiveCatalog import org.apache.hadoop.hive.conf.HiveConf import scala.util.Try /** * Flink引擎通用父接口 * * @author ChengLong 2020年1月7日 09:31:09 */ trait BaseFlink extends BaseFire { protected[fire] var _conf: Configuration = _ protected var hiveCatalog: HiveCatalog = _ protected var parameter: ParameterTool = _ /** * 生命周期方法:初始化fire框架必要的信息 * 注:该方法会同时在driver端与executor端执行 */ override private[fire] def boot: Unit = { PropUtils.load(FireFrameworkConf.FLINK_CONF_FILE) // flink引擎无需主动在父类中主动加载配置信息,配置加载在GlobalConfiguration中完成 if (OSUtils.isLocal || FireFrameworkConf.localEnv) { this.loadConf PropUtils.load(FireFrameworkConf.userCommonConf: _*).loadJobConf(this.getClass.getName) } PropUtils.setProperty(FireFlinkConf.FLINK_DRIVER_CLASS_NAME, this.className) PropUtils.setProperty(FireFlinkConf.FLINK_CLIENT_SIMPLE_CLASS_NAME, this.driverClass) FlinkSingletonFactory.setAppName(this.appName) super.boot } /** * 初始化flink运行时环境 */ override private[fire] def createContext(conf: Any): Unit = { if (FlinkUtils.isYarnApplicationMode) { // fire rest 服务仅支持flink的yarn-application模式 this.restfulRegister = new RestServerManager().startRestPort(GlobalConfiguration.getRestPortAndClose) this.systemRestful = new FlinkSystemRestful(this, this.restfulRegister) } PropUtils.show() FlinkSchedulerManager.getInstance().registerTasks(this, new FlinkInternalTask(this)) // 创建HiveCatalog val metastore = FireHiveConf.getMetastoreUrl if (StringUtils.isNotBlank(metastore)) { val hiveConf = new HiveConf() hiveConf.setVar(HiveConf.ConfVars.METASTOREURIS, metastore) // 根据所选的hive,进行对应hdfs的HA参数设置 FireHDFSConf.hdfsHAConf.foreach(prop => hiveConf.set(prop._1, prop._2)) this.hiveCatalog = new HiveCatalog(FireHiveConf.hiveCatalogName, FireHiveConf.defaultDB, hiveConf, FireHiveConf.hiveVersion) this.logger.info(s"enabled flink-hive support. 
catalogName is ${FireHiveConf.hiveCatalogName}") } } /** * 构建或合并Configuration * 注:不同的子类需根据需要复写该方法 * * @param conf * 在conf基础上构建 * @return * 合并后的Configuration对象 */ def buildConf(conf: Configuration): Configuration /** * 生命周期方法:用于回收资源 */ override def stop: Unit = { try { this.after() } finally { this.shutdown() } } /** * 生命周期方法:进行fire框架的资源回收 * 注:不允许子类覆盖 */ override protected[fire] final def shutdown(stopGracefully: Boolean = true, inListener: Boolean = false): Unit = { super.shutdown(stopGracefully, inListener) if (FireFrameworkConf.shutdownExit) System.exit(0) } /** * 用于解析configuration中的配置,识别flink参数(非用户自定义参数),并设置到env中 */ private[fire] def configParse(env: Any): ExecutionConfig = { requireNonEmpty(env)("Environment对象不能为空") val config = if (env.isInstanceOf[ExecutionEnvironment]) { val batchEnv = env.asInstanceOf[ExecutionEnvironment] // flink.default.parallelism if (FireFlinkConf.defaultParallelism != -1) batchEnv.setParallelism(FireFlinkConf.defaultParallelism) batchEnv.getConfig } else { val streamEnv = env.asInstanceOf[StreamExecutionEnvironment] // flink.max.parallelism if (FireFlinkConf.maxParallelism != -1) streamEnv.setMaxParallelism(FireFlinkConf.maxParallelism) // flink.default.parallelism if (FireFlinkConf.defaultParallelism != -1) streamEnv.setParallelism(FireFlinkConf.defaultParallelism) // flink.stream.buffer.timeout.millis if (FireFlinkConf.streamBufferTimeoutMillis != -1) streamEnv.setBufferTimeout(FireFlinkConf.streamBufferTimeoutMillis) // flink.stream.number.execution.retries if (FireFlinkConf.streamNumberExecutionRetries != -1) streamEnv.setNumberOfExecutionRetries(FireFlinkConf.streamNumberExecutionRetries) // flink.stream.time.characteristic if (StringUtils.isNotBlank(FireFlinkConf.streamTimeCharacteristic)) streamEnv.setStreamTimeCharacteristic(TimeCharacteristic.valueOf(FireFlinkConf.streamTimeCharacteristic)) // checkPoint相关参数 val ckConfig = streamEnv.getCheckpointConfig if (ckConfig != null && FireFlinkConf.streamCheckpointInterval != -1) { // flink.stream.checkpoint.interval 单位:毫秒 默认:-1 关闭 streamEnv.enableCheckpointing(FireFlinkConf.streamCheckpointInterval) // flink.stream.checkpoint.mode EXACTLY_ONCE/AT_LEAST_ONCE 默认:EXACTLY_ONCE if (StringUtils.isNotBlank(FireFlinkConf.streamCheckpointMode)) ckConfig.setCheckpointingMode(CheckpointingMode.valueOf(FireFlinkConf.streamCheckpointMode.trim.toUpperCase)) // flink.stream.checkpoint.timeout 单位:毫秒 默认:10 * 60 * 1000 if (FireFlinkConf.streamCheckpointTimeout > 0) ckConfig.setCheckpointTimeout(FireFlinkConf.streamCheckpointTimeout) // flink.stream.checkpoint.max.concurrent 默认:1 if (FireFlinkConf.streamCheckpointMaxConcurrent > 0) ckConfig.setMaxConcurrentCheckpoints(FireFlinkConf.streamCheckpointMaxConcurrent) // flink.stream.checkpoint.min.pause.between 默认:-1 if (FireFlinkConf.streamCheckpointMinPauseBetween >= 0) { ckConfig.setMinPauseBetweenCheckpoints(FireFlinkConf.streamCheckpointMinPauseBetween) } else { // 如果flink.stream.checkpoint.min.pause.between=-1,则默认的checkpoint间隔时间是checkpoint的频率 ckConfig.setMinPauseBetweenCheckpoints(FireFlinkConf.streamCheckpointInterval) } // flink.stream.checkpoint.prefer.recovery 默认:false // ckConfig.setPreferCheckpointForRecovery(FireFlinkConf.streamCheckpointPreferRecovery) // flink.stream.checkpoint.tolerable.failure.number 默认:0 if (FireFlinkConf.streamCheckpointTolerableFailureNumber >= 0) ckConfig.setTolerableCheckpointFailureNumber(FireFlinkConf.streamCheckpointTolerableFailureNumber) // flink.stream.checkpoint.externalized if 
(StringUtils.isNotBlank(FireFlinkConf.streamCheckpointExternalized)) ckConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.valueOf(FireFlinkConf.streamCheckpointExternalized.trim)) // flink.stream.checkpoint.unaligned.enable ckConfig.enableUnalignedCheckpoints(FireFlinkConf.unalignedCheckpointEnable) } streamEnv.getConfig } FlinkUtils.parseConf(config) config } /** * 获取任务的resourceId * * @return * spark任务:driver/id flink任务:JobManager/container_xxx */ override protected def resourceId: String = FlinkUtils.getResourceId /** * SQL语法校验,如果语法错误,则返回错误堆栈 * * @param sql * sql statement */ override def sqlValidate(sql: JString): Try[Unit] = FlinkUtils.sqlValidate(sql) /** * SQL语法校验 * @param sql * sql statement * @return * true:校验成功 false:校验失败 */ override def sqlLegal(sql: JString): Boolean = FlinkUtils.sqlLegal(sql) /** * 初始化引擎上下文,如SparkSession、StreamExecutionEnvironment等 * 可根据实际情况,将配置参数放到同名的配置文件中进行差异化的初始化 */ override def main(args: Array[String]): Unit = { try { if (args != null && args.nonEmpty) this.parameter = ParameterTool.fromArgs(args) } catch { case _: Throwable => this.logger.error("ParameterTool 解析main方法参数失败,请注意参数的key必须以-或--开头") } finally { this.init(null, args) } } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/BaseFlinkBatch.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.flink import com.zto.fire.common.conf.{FireFrameworkConf, FireHiveConf} import com.zto.fire.common.enu.JobType import com.zto.fire.common.util.{OSUtils, PropUtils} import com.zto.fire.flink.util.FlinkSingletonFactory import org.apache.commons.lang3.StringUtils import org.apache.flink.api.java.utils.ParameterTool import org.apache.flink.api.scala.ExecutionEnvironment import org.apache.flink.configuration.{ConfigConstants, Configuration} import org.apache.flink.table.api.{EnvironmentSettings, TableEnvironment} /** * flink batch通用父接口 * @author ChengLong 2020年1月7日 15:15:56 */ trait BaseFlinkBatch extends BaseFlink { override val jobType: JobType = JobType.FLINK_BATCH protected var env, flink, fire: ExecutionEnvironment = _ protected var tableEnv: TableEnvironment = _ /** * 构建或合并Configuration * 注:不同的子类需根据需要复写该方法 * * @param conf * 在conf基础上构建 * @return * 合并后的Configuration对象 */ override def buildConf(conf: Configuration): Configuration = { val finalConf = if (conf != null) conf else { val tmpConf = new Configuration() PropUtils.settings.foreach(t => tmpConf.setString(t._1, t._2)) tmpConf } finalConf.setBoolean(ConfigConstants.LOCAL_START_WEBSERVER, true) this._conf = finalConf finalConf } /** * 程序初始化方法,用于初始化必要的值 * * @param conf * 用户指定的配置信息 * @param args * main方法参数列表 */ override def init(conf: Any = null, args: Array[String] = null): Unit = { super.init(conf, args) if (conf != null) conf.asInstanceOf[Configuration].setBoolean(ConfigConstants.LOCAL_START_WEBSERVER, true) this.processAll } /** * 创建计算引擎运行时环境 * * @param conf * 配置信息 */ override private[fire] def createContext(conf: Any): Unit = { super.createContext(conf) val finalConf = this.buildConf(conf.asInstanceOf[Configuration]) if (OSUtils.isLocal) { this.env = ExecutionEnvironment.createLocalEnvironmentWithWebUI(finalConf) } else { this.env = ExecutionEnvironment.getExecutionEnvironment } this.env.getConfig.setGlobalJobParameters(ParameterTool.fromMap(finalConf.toMap)) this.configParse(this.env) this.tableEnv = TableEnvironment.create(EnvironmentSettings.newInstance.inBatchMode().build()) if (StringUtils.isNotBlank(FireHiveConf.getHiveConfDir)) { this.tableEnv.registerCatalog(FireHiveConf.hiveCatalogName, this.hiveCatalog) } this.flink = this.env this.fire = this.flink FlinkSingletonFactory.setEnv(this.env).setTableEnv(this.tableEnv) } /** * 在加载任务配置文件前将被加载 */ override private[fire] def loadConf(): Unit = { // 加载配置文件 PropUtils.load(FireFrameworkConf.FLINK_BATCH_CONF_FILE) } /** * 生命周期方法:具体的用户开发的业务逻辑代码 * 注:此方法会被自动调用,不需要在main中手动调用 */ override def process: Unit = { // 子类复写该方法实现业务处理逻辑 } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/BaseFlinkCore.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink /** * flink batch通用父接口 * * @author ChengLong 2020年1月7日 15:15:56 */ trait BaseFlinkCore extends BaseFlinkBatch { } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/BaseFlinkStreaming.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink import com.zto.fire._ import com.zto.fire.common.conf.{FireFrameworkConf, FireHiveConf} import com.zto.fire.common.enu.JobType import com.zto.fire.common.util.{OSUtils, PropUtils} import com.zto.fire.flink.conf.FireFlinkConf import com.zto.fire.flink.util.{FlinkSingletonFactory, FlinkUtils} import org.apache.commons.lang3.StringUtils import org.apache.flink.api.common.RuntimeExecutionMode import org.apache.flink.api.java.utils.ParameterTool import org.apache.flink.api.scala._ import org.apache.flink.configuration.{ConfigConstants, Configuration} import org.apache.flink.streaming.api.scala.{OutputTag, StreamExecutionEnvironment} import org.apache.flink.table.api.{EnvironmentSettings, TableEnvironment} import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment import org.apache.flink.table.functions.ScalarFunction /** * flink streaming通用父接口 * * @author ChengLong 2020年1月7日 10:50:19 */ trait BaseFlinkStreaming extends BaseFlink { protected var env, senv, flink, fire: StreamExecutionEnvironment = _ protected var tableEnv: TableEnvironment = _ protected lazy val sql = fire.sql(_) protected lazy val sqlQuery = fire.sqlQuery(_) protected lazy val steamTableEnv: StreamTableEnvironment = this.tableEnv.asInstanceOf[StreamTableEnvironment] override val jobType: JobType = JobType.FLINK_STREAMING // 用于存放延期的数据 protected lazy val outputTag = new OutputTag[Any]("later_data") /** * 构建或合并Configuration * 注:不同的子类需根据需要复写该方法 * * @param conf * 在conf基础上构建 * @return * 合并后的Configuration对象 */ override def buildConf(conf: Configuration): Configuration = { val finalConf = if (conf != null) conf else { val tmpConf = new Configuration() PropUtils.settings.foreach(t => tmpConf.setString(t._1, t._2)) tmpConf } finalConf.setBoolean(ConfigConstants.LOCAL_START_WEBSERVER, true) this._conf = finalConf finalConf } /** * 程序初始化方法,用于初始化必要的值 * * @param conf * 用户指定的配置信息 * @param args * main方法参数列表 */ override def init(conf: Any = null, args: Array[String] = null): Unit = { super.init(conf, args) this.processAll if (FireFrameworkConf.jobAutoStart) this.fire.start } /** * 初始化flink运行时环境 */ override def createContext(conf: Any): Unit = { super.createContext(conf) if (FlinkUtils.isYarnApplicationMode) this.restfulRegister.startRestServer val finalConf = this.buildConf(conf.asInstanceOf[Configuration]) if 
(OSUtils.isLocal) { this.env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(finalConf) } else { this.env = StreamExecutionEnvironment.getExecutionEnvironment } val runtimeExecutionMode = RuntimeExecutionMode.valueOf(FireFlinkConf.flinkRuntimeMode) this.env.setRuntimeMode(runtimeExecutionMode) this.env.getConfig.setGlobalJobParameters(ParameterTool.fromMap(finalConf.toMap)) if (!FireFlinkConf.operatorChainingEnable) this.env.disableOperatorChaining() this.configParse(this.env) this.senv = this.env val builder = EnvironmentSettings.newInstance this.tableEnv = if (runtimeExecutionMode == RuntimeExecutionMode.BATCH) { TableEnvironment.create(builder.inBatchMode().build()) } else { StreamTableEnvironment.create(this.env, builder.inStreamingMode().build()) } val tableConfig = this.tableEnv.getConfig.getConfiguration FireFlinkConf.flinkSqlConfig.filter(kv => noEmpty(kv, kv._1, kv._2)).foreach(kv => tableConfig.setString(kv._1, kv._2)) if (StringUtils.isNotBlank(FireHiveConf.getMetastoreUrl)) { this.tableEnv.registerCatalog(FireHiveConf.hiveCatalogName, this.hiveCatalog) } this.flink = this.env this.fire = this.flink FlinkSingletonFactory.setStreamEnv(this.env).setTableEnv(this.tableEnv) FlinkUtils.loadUdfJar // 自动注册配置文件中指定的udf函数 if (FireFlinkConf.flinkUdfEnable) { FireFlinkConf.flinkUdfList.filter(udf => noEmpty(udf, udf._1, udf._2)).foreach(udf => { this.logger.info(s"register udf function [ ${udf._1} ] with class [ ${udf._2} ].") this.tableEnv.createTemporarySystemFunction(udf._1, Class.forName(udf._2).asInstanceOf[Class[ScalarFunction]]) }) } } /** * 在加载任务配置文件前将被加载 */ override private[fire] def loadConf(): Unit = { // 加载配置文件 PropUtils.load(FireFrameworkConf.FLINK_STREAMING_CONF_FILE) } /** * 生命周期方法:具体的用户开发的业务逻辑代码 * 注:此方法会被自动调用,不需要在main中手动调用 */ override def process: Unit = { // 子类复写该方法实现业务处理逻辑 } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/FlinkBatch.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink /** * flink batch通用父接口 * @author ChengLong 2020年1月7日 15:15:56 */ trait FlinkBatch extends BaseFlinkBatch { } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/FlinkCore.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.zto.fire.flink

/**
 * Common parent interface for flink batch jobs
 * @author ChengLong 2020-01-07 15:15:56
 */
trait FlinkCore extends BaseFlinkBatch {

}


================================================
FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/FlinkStreaming.scala
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.zto.fire.flink

/**
 * Common parent interface for flink streaming jobs
 *
 * @author ChengLong 2020-01-07 10:52:19
 */
trait FlinkStreaming extends BaseFlinkStreaming {

}
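A minimal sketch of how a user job plugs into the traits above. This is illustrative rather than shipped example code: it assumes the implicits exposed by com.zto.fire._, assumes the @Streaming annotation's other elements have defaults (its mapping onto the checkpoint keys is shown in FlinkAnnoManager further below), and the object name and data are hypothetical placeholders.

package com.zto.fire.examples.flink.stream

import com.zto.fire._
import com.zto.fire.flink.FlinkStreaming
import com.zto.fire.flink.anno.Streaming
import org.apache.flink.streaming.api.scala._

/**
 * Hypothetical word-count style job: @Streaming(60) is mapped by FlinkAnnoManager to
 * flink.stream.checkpoint.interval = 60000, and process() is invoked by the fire lifecycle,
 * so no explicit environment creation or execute() call is written here.
 */
@Streaming(60)
object HelloFire extends FlinkStreaming {

  override def process: Unit = {
    // `fire` is the StreamExecutionEnvironment created by BaseFlinkStreaming.createContext
    val words = fire.fromCollection(Seq("fire", "flink", "fire"))
    words.map(word => (word, 1)).keyBy(_._1).sum(1).print()
    // when FireFrameworkConf.jobAutoStart is true, init() starts the job right after process()
  }
}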
================================================
FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/acc/MultiCounterAccumulator.scala
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.zto.fire.flink.acc

import java.util.concurrent.ConcurrentHashMap

import com.zto.fire.predef._
import org.apache.flink.api.common.accumulators.{Accumulator, SimpleAccumulator}

/**
 * Custom flink multi-value counter accumulator
 *
 * @author ChengLong 2020-01-11 13:58:15
 * @since 0.4.1
 */
private[fire] class MultiCounterAccumulator extends SimpleAccumulator[ConcurrentHashMap[String, Long]] {
  private[fire] val multiCounter = new ConcurrentHashMap[String, Long]()

  /**
   * Add a new batch of values to the accumulator
   *
   * @param value
   */
  override def add(value: ConcurrentHashMap[String, Long]): Unit = {
    this.mergeMap(value)
  }

  /**
   * Add a single key/value pair to the accumulator
   */
  def add(kv: (String, Long)): Unit = {
    if (kv != null) {
      this.multiCounter.put(kv._1, this.multiCounter.getOrDefault(kv._1, 0) + kv._2)
    }
  }

  /**
   * Get the current local value of the accumulator
   *
   * @return
   *         the accumulator value of the current jvm only, not the global value
   */
  override def getLocalValue: ConcurrentHashMap[String, Long] = {
    this.multiCounter
  }

  /**
   * Clear the current local value of the accumulator
   */
  override def resetLocal(): Unit = {
    this.multiCounter.clear()
  }

  /**
   * Merge the values of two accumulators
   */
  override def merge(other: Accumulator[ConcurrentHashMap[String, Long], ConcurrentHashMap[String, Long]]): Unit = {
    this.mergeMap(other.getLocalValue)
  }

  /**
   * Merge the given map into the accumulator's map:
   * existing keys are summed, new keys are added directly
   */
  private[this] def mergeMap(value: ConcurrentHashMap[String, Long]): Unit = {
    if (noEmpty(value)) {
      value.foreach(kv => {
        this.multiCounter.put(kv._1, this.multiCounter.getOrDefault(kv._1, 0) + kv._2)
      })
    }
  }
}


================================================
FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/conf/FireFlinkConf.scala
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package com.zto.fire.flink.conf import com.zto.fire.common.util.PropUtils /** * flink相关配置 * * @author ChengLong * @since 1.1.0 * @create 2020-07-13 14:55 */ private[fire] object FireFlinkConf { lazy val FLINK_AUTO_GENERATE_UID_ENABLE = "flink.auto.generate.uid.enable" lazy val FLINK_AUTO_TYPE_REGISTRATION_ENABLE = "flink.auto.type.registration.enable" lazy val FLINK_FORCE_AVRO_ENABLE = "flink.force.avro.enable" lazy val FLINK_FORCE_KRYO_ENABLE = "flink.force.kryo.enable" lazy val FLINK_GENERIC_TYPES_ENABLE = "flink.generic.types.enable" lazy val FLINK_OBJECT_REUSE_ENABLE = "flink.object.reuse.enable" lazy val FLINK_AUTO_WATERMARK_INTERVAL = "flink.auto.watermark.interval" lazy val FLINK_CLOSURE_CLEANER_LEVEL = "flink.closure.cleaner.level" lazy val FLINK_DEFAULT_INPUT_DEPENDENCY_CONSTRAINT = "flink.default.input.dependency.constraint" lazy val FLINK_EXECUTION_MODE = "flink.execution.mode" lazy val FLINK_RUNTIME_MODE = "flink.runtime.mode" lazy val FLINK_LATENCY_TRACKING_INTERVAL = "flink.latency.tracking.interval" lazy val FLINK_MAX_PARALLELISM = "flink.max.parallelism" lazy val FLINK_DEFAULT_PARALLELISM = "flink.default.parallelism" lazy val FLINK_TASK_CANCELLATION_INTERVAL = "flink.task.cancellation.interval" lazy val FLINK_TASK_CANCELLATION_TIMEOUT_MILLIS = "flink.task.cancellation.timeout.millis" lazy val FLINK_USE_SNAPSHOT_COMPRESSION = "flink.use.snapshot.compression" lazy val FLINK_STREAM_BUFFER_TIMEOUT_MILLIS = "flink.stream.buffer.timeout.millis" lazy val FLINK_STREAM_NUMBER_EXECUTION_RETRIES = "flink.stream.number.execution.retries" lazy val FLINK_STREAM_TIME_CHARACTERISTIC = "flink.stream.time.characteristic" lazy val FLINK_DRIVER_CLASS_NAME = "flink.driver.class.name" lazy val FLINK_CLIENT_SIMPLE_CLASS_NAME = "flink.client.simple.class.name" lazy val FLINK_SQL_CONF_UDF_JARS = "flink.sql.conf.pipeline.jars" lazy val FLINK_SQL_LOG_ENABLE = "flink.sql.log.enable" lazy val FLINK_SQL_DEFAULT_CATALOG_NAME = "flink.sql.default.catalog.name" lazy val FLINK_STATE_TTL_DAYS = "flink.state.ttl.days" lazy val DISTRIBUTE_SYNC_ENABLE = "fire.distribute.sync.enable" lazy val OPERATOR_CHAINING_ENABLE = "flink.env.operatorChaining.enable" // checkpoint相关配置项 lazy val FLINK_STREAM_CHECKPOINT_INTERVAL = "flink.stream.checkpoint.interval" lazy val FLINK_STREAM_CHECKPOINT_MODE = "flink.stream.checkpoint.mode" lazy val FLINK_STREAM_CHECKPOINT_TIMEOUT = "flink.stream.checkpoint.timeout" lazy val FLINK_STREAM_CHECKPOINT_MAX_CONCURRENT = "flink.stream.checkpoint.max.concurrent" lazy val FLINK_STREAM_CHECKPOINT_MIN_PAUSE_BETWEEN = "flink.stream.checkpoint.min.pause.between" lazy val FLINK_STREAM_CHECKPOINT_PREFER_RECOVERY = "flink.stream.checkpoint.prefer.recovery" lazy val FLINK_STREAM_CHECKPOINT_TOLERABLE_FAILURE_NUMBER = "flink.stream.checkpoint.tolerable.failure.number" lazy val FLINK_STREAM_CHECKPOINT_EXTERNALIZED = "flink.stream.checkpoint.externalized" lazy val FLINK_STREAM_CHECKPOINT_UNALIGNED = "flink.stream.checkpoint.unaligned.enable" lazy val FLINK_SQL_WITH_REPLACE_MODE_ENABLE = "flink.sql_with.replaceMode.enable" lazy val FLINK_STATE_CLEAN_HDFS_URL = "flink.state.clean.hdfs.url" // flink sql相关配置 lazy val FLINK_SQL_CONF_PREFIX = "flink.sql.conf." // udf自动注册 lazy val FLINK_SQL_UDF_CONF_PREFIX = "flink.sql.udf.conf." 
lazy val FLINK_SQL_UDF_ENABLE = "flink.sql.udf.fireUdf.enable" /** * 获取所有flink.sql.with.为前缀的配置信息如: * flink.sql.with.bill_db.connector = mysql * flink.sql.with.bill_db.url = jdbc:mysql://localhost:3306/fire * 上述配置标识定义名为bill_db的数据源,配置了两个options选项分别为: * connector = mysql * url = jdbc:mysql://localhost:3306/fire * sql中即可通过 'datasource'='bill_db' 引用到上述两项option */ lazy val FLINK_SQL_WITH_PREFIX = "flink.sql.with." lazy val FLINK_SQL_AUTO_ADD_STATEMENT_SET = "flink.sql.auto.add.statementSet" lazy val defaultCatalogName = PropUtils.getString(this.FLINK_SQL_DEFAULT_CATALOG_NAME, "default_catalog") lazy val sqlWithReplaceModeEnable = PropUtils.getBoolean(this.FLINK_SQL_WITH_REPLACE_MODE_ENABLE, true) lazy val autoGenerateUidEnable = PropUtils.getBoolean(this.FLINK_AUTO_GENERATE_UID_ENABLE, true) lazy val autoTypeRegistrationEnable = PropUtils.getBoolean(this.FLINK_AUTO_TYPE_REGISTRATION_ENABLE, true) lazy val forceAvroEnable = PropUtils.getBoolean(this.FLINK_FORCE_AVRO_ENABLE, false) lazy val forceKryoEnable = PropUtils.getBoolean(this.FLINK_FORCE_KRYO_ENABLE, false) lazy val genericTypesEnable = PropUtils.getBoolean(this.FLINK_GENERIC_TYPES_ENABLE, false) lazy val objectReuseEnable = PropUtils.getBoolean(this.FLINK_OBJECT_REUSE_ENABLE, false) lazy val autoWatermarkInterval = PropUtils.getLong(this.FLINK_AUTO_WATERMARK_INTERVAL, -1) lazy val closureCleanerLevel = PropUtils.getString(this.FLINK_CLOSURE_CLEANER_LEVEL) lazy val defaultInputDependencyConstraint = PropUtils.getString(this.FLINK_DEFAULT_INPUT_DEPENDENCY_CONSTRAINT) lazy val executionMode = PropUtils.getString(this.FLINK_EXECUTION_MODE) lazy val latencyTrackingInterval = PropUtils.getLong(this.FLINK_LATENCY_TRACKING_INTERVAL, -1) lazy val maxParallelism = PropUtils.getInt(this.FLINK_MAX_PARALLELISM, 1024) lazy val defaultParallelism = PropUtils.getInt(this.FLINK_DEFAULT_PARALLELISM, -1) lazy val taskCancellationInterval = PropUtils.getLong(this.FLINK_TASK_CANCELLATION_INTERVAL, -1) lazy val taskCancellationTimeoutMillis = PropUtils.getLong(this.FLINK_TASK_CANCELLATION_TIMEOUT_MILLIS, -1) lazy val useSnapshotCompression = PropUtils.getBoolean(this.FLINK_USE_SNAPSHOT_COMPRESSION, false) lazy val streamBufferTimeoutMillis = PropUtils.getLong(this.FLINK_STREAM_BUFFER_TIMEOUT_MILLIS, -1) lazy val streamNumberExecutionRetries = PropUtils.getInt(this.FLINK_STREAM_NUMBER_EXECUTION_RETRIES, -1) lazy val streamTimeCharacteristic = PropUtils.getString(this.FLINK_STREAM_TIME_CHARACTERISTIC, "") lazy val sqlLogEnable = PropUtils.getBoolean(this.FLINK_SQL_LOG_ENABLE, false) lazy val unalignedCheckpointEnable = PropUtils.getBoolean(this.FLINK_STREAM_CHECKPOINT_UNALIGNED, true) lazy val distributeSyncEnabled = PropUtils.getBoolean(this.DISTRIBUTE_SYNC_ENABLE, true) // checkpoint相关配置项 lazy val streamCheckpointInterval = PropUtils.getLong(this.FLINK_STREAM_CHECKPOINT_INTERVAL, -1) lazy val streamCheckpointMode = PropUtils.getString(this.FLINK_STREAM_CHECKPOINT_MODE, "EXACTLY_ONCE") lazy val streamCheckpointTimeout = PropUtils.getLong(this.FLINK_STREAM_CHECKPOINT_TIMEOUT, 600000L) lazy val streamCheckpointMaxConcurrent = PropUtils.getInt(this.FLINK_STREAM_CHECKPOINT_MAX_CONCURRENT, 1) lazy val streamCheckpointMinPauseBetween = PropUtils.getInt(this.FLINK_STREAM_CHECKPOINT_MIN_PAUSE_BETWEEN, -1) lazy val streamCheckpointPreferRecovery = PropUtils.getBoolean(this.FLINK_STREAM_CHECKPOINT_PREFER_RECOVERY, false) lazy val streamCheckpointTolerableFailureNumber = PropUtils.getInt(this.FLINK_STREAM_CHECKPOINT_TOLERABLE_FAILURE_NUMBER, 0) lazy val 
streamCheckpointExternalized = PropUtils.getString(this.FLINK_STREAM_CHECKPOINT_EXTERNALIZED, "RETAIN_ON_CANCELLATION") // flink sql相关配置 lazy val flinkSqlConfig = PropUtils.sliceKeys(this.FLINK_SQL_CONF_PREFIX) // 用于自动注册udf jar包中的函数 lazy val flinkUdfList = PropUtils.sliceKeys(this.FLINK_SQL_UDF_CONF_PREFIX) // 是否启用fire udf注册功能 lazy val flinkUdfEnable = PropUtils.getBoolean(this.FLINK_SQL_UDF_ENABLE, true) // 运行模式 lazy val flinkRuntimeMode = PropUtils.getString(this.FLINK_RUNTIME_MODE, PropUtils.getString("execution.runtime-mode", "STREAMING")) // 默认的Keyed State的TTL时间 lazy val flinkStateTTL = PropUtils.getInt(this.FLINK_STATE_TTL_DAYS, 31) // 是否开启算子链合并 lazy val operatorChainingEnable = PropUtils.getBoolean(this.OPERATOR_CHAINING_ENABLE, true) // 是否自动将insert语句加入到StatementSet中 lazy val autoAddStatementSet = PropUtils.getBoolean(this.FLINK_SQL_AUTO_ADD_STATEMENT_SET, true) // 将配置的with options映射为map lazy val flinkSqlWithOptions = PropUtils.sliceKeys(FireFlinkConf.FLINK_SQL_WITH_PREFIX) // flink状态清理的hdfs路径前缀 lazy val stateHdfsUrl = PropUtils.getString(this.FLINK_STATE_CLEAN_HDFS_URL) } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/conf/FlinkAnnoManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.flink.conf import com.zto.fire.common.conf.FireFrameworkConf.FIRE_JOB_AUTO_START import com.zto.fire.common.util.PropUtils import com.zto.fire.core.conf.AnnoManager import com.zto.fire.flink.anno.{Checkpoint, FlinkConf, Streaming} import com.zto.fire.flink.conf.FireFlinkConf._ /** * 注解管理器,用于将主键中的配置信息映射为键值对信息 * * @author ChengLong 2022-04-26 11:19:00 * @since 2.2.2 */ private[fire] class FlinkAnnoManager extends AnnoManager { /** * 将@Streaming中配置的信息映射为键值对形式 * @param streaming * Streaming注解实例 */ def mapStreaming(streaming: Streaming): Unit = { /** * 将时间单位由s转为ms */ def unitConversion(value: Int): Int = if (value > 0) value * 1000 else -1 this.put(FLINK_STREAM_CHECKPOINT_INTERVAL, unitConversion(streaming.value())) this.put(FLINK_STREAM_CHECKPOINT_INTERVAL, unitConversion(streaming.interval())) this.put(FLINK_STREAM_CHECKPOINT_TIMEOUT, unitConversion(streaming.timeout())) this.put(FLINK_STREAM_CHECKPOINT_MIN_PAUSE_BETWEEN, unitConversion(streaming.pauseBetween())) this.put(FLINK_STREAM_CHECKPOINT_UNALIGNED, streaming.unaligned()) this.put(FLINK_STREAM_CHECKPOINT_MAX_CONCURRENT, streaming.concurrent()) this.put(FLINK_STREAM_CHECKPOINT_TOLERABLE_FAILURE_NUMBER, streaming.failureNumber()) this.put(FLINK_STREAM_CHECKPOINT_MODE, streaming.mode()) this.put(streamCheckpointExternalized, streaming.cleanup()) this.put(FIRE_JOB_AUTO_START, streaming.autoStart()) this.put(FLINK_DEFAULT_PARALLELISM, streaming.parallelism()) this.put(OPERATOR_CHAINING_ENABLE, streaming.disableOperatorChaining()) this.put(FLINK_STATE_TTL_DAYS, streaming.stateTTL()) } /** * 将@Checkpoint中配置的信息映射为键值对形式 * @param checkpoint * Checkpoint注解实例 */ def mapCheckpoint(checkpoint: Checkpoint): Unit = { /** * 将时间单位由s转为ms */ def unitConversion(value: Int): Int = if (value > 0) value * 1000 else -1 this.put(FLINK_STREAM_CHECKPOINT_INTERVAL, unitConversion(checkpoint.value())) this.put(FLINK_STREAM_CHECKPOINT_INTERVAL, unitConversion(checkpoint.interval())) this.put(FLINK_STREAM_CHECKPOINT_TIMEOUT, unitConversion(checkpoint.timeout())) this.put(FLINK_STREAM_CHECKPOINT_MIN_PAUSE_BETWEEN, unitConversion(checkpoint.pauseBetween())) this.put(FLINK_STREAM_CHECKPOINT_UNALIGNED, checkpoint.unaligned()) this.put(FLINK_STREAM_CHECKPOINT_MAX_CONCURRENT, checkpoint.concurrent()) this.put(FLINK_STREAM_CHECKPOINT_TOLERABLE_FAILURE_NUMBER, checkpoint.failureNumber()) this.put(FLINK_STREAM_CHECKPOINT_MODE, checkpoint.mode()) this.put(streamCheckpointExternalized, checkpoint.cleanup()) } /** * 将@FlinkConf中配置的信息映射为键值对形式 */ def mapFlinkConf(flinkConf: FlinkConf): Unit = { val valueConf = PropUtils.parseTextConfig(flinkConf.value()) valueConf.foreach(kv => this.props.put(kv._1, kv._2)) flinkConf.props().foreach(prop => { val conf = prop.split("=", 2) if (conf != null && conf.length == 2) { this.props.put(conf(0), conf(1)) } }) } /** * 用于注册需要映射配置信息的自定义主键 */ override protected[fire] def register: Unit = { AnnoManager.registerAnnoSet.add(classOf[FlinkConf]) AnnoManager.registerAnnoSet.add(classOf[Streaming]) AnnoManager.registerAnnoSet.add(classOf[Checkpoint]) } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/batch/BatchExecutionEnvExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.ext.batch import com.zto.fire.common.util.ValueUtils import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment} import scala.reflect.ClassTag /** * 用于flink ExecutionEnvironment API库扩展 * * @author ChengLong 2020年1月9日 13:52:16 * @since 0.4.1 */ class BatchExecutionEnvExt(env: ExecutionEnvironment) { /** * 提交job执行 * * @param jobName * job名称 */ def start(jobName: String = ""): Unit = { if (ValueUtils.isEmpty(jobName)) this.env.execute() else this.env.execute(jobName) } /** * 使用集合元素创建DataStream * @param seq * 元素集合 * @tparam T * 元素的类型 */ def parallelize[T: TypeInformation: ClassTag](seq: Seq[T], parallelism: Int = this.env.getParallelism): DataSet[T] = { this.env.fromCollection[T](seq).setParallelism(parallelism) } /** * 使用集合元素创建DataStream * @param seq * 元素集合 * @tparam T * 元素的类型 */ def createCollectionDataSet[T: TypeInformation: ClassTag](seq: Seq[T], parallelism: Int = this.env.getParallelism): DataSet[T] = this.parallelize[T](seq, parallelism) } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/batch/BatchTableEnvExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.ext.batch import com.zto.fire.jdbc.JdbcConnectorBridge import org.apache.flink.table.api.{Table, TableEnvironment} /** * 用于flink BatchTableEnvironment API库扩展 * * @author ChengLong 2020年1月9日 13:52:16 * @since 0.4.1 */ class BatchTableEnvExt(env: TableEnvironment) extends JdbcConnectorBridge { /** * 执行sql query操作 * * @param sql * sql语句 * @return * table对象 */ def sql(sql: String): Table = { this.env.sqlQuery(sql) } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/batch/DataSetExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.ext.batch import com.zto.fire.flink.util.FlinkSingletonFactory import org.apache.flink.api.scala.DataSet import org.apache.flink.table.api.{Table, TableEnvironment} /** * 用于对Flink DataSet的API库扩展 * * @author ChengLong 2020年1月15日 16:35:03 * @since 0.4.1 */ class DataSetExt[T](dataSet: DataSet[T]){ lazy val tableEnv = FlinkSingletonFactory.getTableEnv.asInstanceOf[TableEnvironment] /** * 将DataSet注册为临时表 * * @param tableName * 临时表的表名 */ def createOrReplaceTempView(tableName: String): Table = { val table = this.tableEnv.fromValues(this.dataSet) this.tableEnv.createTemporaryView(tableName, table) table } /** * 设置并行度 */ def repartition(parallelism: Int): DataSet[T] = { this.dataSet.setParallelism(parallelism) } /** * 将DataSet转为Table */ def toTable: Table = { this.tableEnv.fromValues(this.dataSet) } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/function/RichFunctionExt.scala ================================================ package com.zto.fire.flink.ext.function import com.zto.fire._ import com.zto.fire.flink.conf.FireFlinkConf import org.apache.flink.api.common.functions.{AggregateFunction, ReduceFunction, RichFunction} import org.apache.flink.api.common.state.{AggregatingState, AggregatingStateDescriptor, ListState, ListStateDescriptor, MapState, MapStateDescriptor, ReducingState, ReducingStateDescriptor, State, StateTtlConfig, ValueState, ValueStateDescriptor} import org.apache.flink.api.common.time.Time import java.io.File import scala.reflect.ClassTag /** * RichFunction api扩展,支持方便的获取状态数据 * * @author ChengLong 2021-9-14 09:59:17 * @since 2.2.0 */ class RichFunctionExt(richFunction: RichFunction) { lazy val runtimeContext = richFunction.getRuntimeContext private[this] lazy val stateMap = new JConcurrentHashMap[String, State]() // 默认的状态TTL配置(flink.state.ttl.days) private[this] lazy val defaultTTLConfig = StateTtlConfig .newBuilder(Time.days(FireFlinkConf.flinkStateTTL)) .setUpdateType(StateTtlConfig.UpdateType.OnReadAndWrite) .setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired) .build() /** * 根据name获取广播变量 * * @param name 广播变量名称 * @tparam T * 广播变量的类型 * @return * 广播变量引用 */ def getBroadcastVariable[T](name: String): Seq[T] = { requireNonEmpty(name)("广播变量名称不能为空") this.runtimeContext.getBroadcastVariable[T](name) } /** * 将值添加到指定的累加器中 * * @param name * 累加器名称 * @param value * 待累加的值 * @tparam T * 累加值的类型(Int/Long/Double) */ def addCounter[T: ClassTag](name: String, value: T): Unit = { requireNonEmpty(name, value) getParamType[T] match { case valueType if valueType eq classOf[Int] => this.runtimeContext.getIntCounter(name).add(value.asInstanceOf[Int]) case valueType if valueType eq classOf[Long] => this.runtimeContext.getLongCounter(name).add(value.asInstanceOf[Long]) case valueType if valueType eq classOf[Double] => this.runtimeContext.getDoubleCounter(name).add(value.asInstanceOf[Double]) } } /** * 根据文件名获取分布式缓存文件 * * @param 
fileName 缓存文件名称 * @return 被缓存的文件 */ def DistributedCache(fileName: String): File = { requireNonEmpty(fileName)("分布式缓存文件名称不能为空!") this.runtimeContext.getDistributedCache.getFile(fileName) } /** * 根据name获取ValueState */ def getState[T: ClassTag](name: String, ttlConfig: StateTtlConfig = this.defaultTTLConfig): ValueState[T] = { this.stateMap.mergeGet(name) { val desc = new ValueStateDescriptor[T](name, getParamType[T]) if (ttlConfig != null) desc.enableTimeToLive(ttlConfig) this.runtimeContext.getState[T](desc) }.asInstanceOf[ValueState[T]] } /** * 根据name获取ListState */ def getListState[T: ClassTag](name: String, ttlConfig: StateTtlConfig = this.defaultTTLConfig): ListState[T] = { this.stateMap.mergeGet(name) { val desc = new ListStateDescriptor[T](name, getParamType[T]) if (ttlConfig != null) desc.enableTimeToLive(ttlConfig) this.runtimeContext.getListState[T](desc) }.asInstanceOf[ListState[T]] } /** * 根据name获取MapState */ def getMapState[K: ClassTag, V: ClassTag](name: String, ttlConfig: StateTtlConfig = this.defaultTTLConfig): MapState[K, V] = { this.stateMap.mergeGet(name) { val desc = new MapStateDescriptor[K, V](name, getParamType[K], getParamType[V]) if (ttlConfig != null) desc.enableTimeToLive(ttlConfig) this.runtimeContext.getMapState[K, V](desc) }.asInstanceOf[MapState[K, V]] } /** * 根据name获取ReducingState */ def getReducingState[T: ClassTag](name: String, reduceFun: (T, T) => T, ttlConfig: StateTtlConfig = this.defaultTTLConfig): ReducingState[T] = { this.stateMap.mergeGet(name) { val desc = new ReducingStateDescriptor[T](name, new ReduceFunction[T] { override def reduce(value1: T, value2: T): T = reduceFun(value1, value2) }, getParamType[T]) if (ttlConfig != null) desc.enableTimeToLive(ttlConfig) this.runtimeContext.getReducingState[T](desc) }.asInstanceOf[ReducingState[T]] } /** * 根据name获取AggregatingState */ def getAggregatingState[I, T: ClassTag, O](name: String, aggFunction: AggregateFunction[I, T, O], ttlConfig: StateTtlConfig = this.defaultTTLConfig): AggregatingState[I, O] = { this.stateMap.mergeGet(name) { val desc = new AggregatingStateDescriptor[I, T, O](name, aggFunction, getParamType[T]) if (ttlConfig != null) desc.enableTimeToLive(ttlConfig) this.runtimeContext.getAggregatingState(desc) }.asInstanceOf[AggregatingState[I, O]] } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/function/RuntimeContextExt.scala ================================================ package com.zto.fire.flink.ext.function import com.zto.fire.common.util.Logging import org.apache.flink.api.common.functions.RuntimeContext /** * RuntimeContext扩展 * * @author ChengLong 2021-9-13 14:26:28 * @since 2.2.0 */ class RuntimeContextExt(runtimeContext: RuntimeContext) extends Logging { } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/provider/HBaseConnectorProvider.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.ext.provider import com.zto.fire._ import com.zto.fire.hbase.bean.HBaseBaseBean import org.apache.flink.streaming.api.datastream.DataStreamSink import org.apache.flink.streaming.api.scala.DataStream import org.apache.flink.table.api.Table import org.apache.flink.types.Row import scala.reflect.ClassTag /** * 为上层扩展层提供HBaseConnector API * * @author ChengLong * @since 2.0.0 * @create 2020-12-24 10:16 */ trait HBaseConnectorProvider { /** * hbase批量sink操作,DataStream[T]中的T必须是HBaseBaseBean的子类 * * @param tableName * hbase表名 * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 */ def hbasePutDS[T <: HBaseBaseBean[T]: ClassTag](stream: DataStream[T], tableName: String, batch: Int = 100, flushInterval: Long = 3000, keyNum: Int = 1): DataStreamSink[_] = { stream.hbasePutDS(tableName, batch, flushInterval, keyNum) } /** * hbase批量sink操作,DataStream[T]中的T必须是HBaseBaseBean的子类 * * @param tableName * hbase表名 * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 * @param fun * 将dstream中的数据映射为该sink组件所能处理的数据 */ def hbasePutDS2[T <: HBaseBaseBean[T] : ClassTag](stream: DataStream[T], tableName: String, batch: Int = 100, flushInterval: Long = 3000, keyNum: Int = 1)(fun: T => T): DataStreamSink[_] = { stream.hbasePutDS2[T](tableName, batch, flushInterval, keyNum)(fun) } /** * table的hbase批量sink操作,该api需用户定义row的取数规则,并映射到对应的HBaseBaseBean的子类中 * * @param tableName * HBase表名 * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 */ def hbasePutTable[T <: HBaseBaseBean[T]: ClassTag](table: Table, tableName: String, batch: Int = 100, flushInterval: Long = 3000, keyNum: Int = 1): DataStreamSink[_] = { table.hbasePutTable[T](tableName, batch, flushInterval, keyNum) } /** * table的hbase批量sink操作,该api需用户定义row的取数规则,并映射到对应的HBaseBaseBean的子类中 * * @param tableName * HBase表名 * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 */ def hbasePutTable2[T <: HBaseBaseBean[T]: ClassTag](table: Table, tableName: String, batch: Int = 100, flushInterval: Long = 3000, keyNum: Int = 1)(fun: Row => T): DataStreamSink[_] = { table.hbasePutTable2[T](tableName, batch, flushInterval, keyNum)(fun) } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/provider/JdbcFlinkProvider.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.ext.provider import com.zto.fire._ import org.apache.flink.streaming.api.datastream.DataStreamSink import org.apache.flink.streaming.api.scala.DataStream import org.apache.flink.table.api.Table import org.apache.flink.types.Row /** * 为上层扩展层提供JDBC相关API * * @author ChengLong * @since 2.0.0 * @create 2020-12-24 10:18 */ trait JdbcFlinkProvider { /** * jdbc批量sink操作,根据用户指定的DataStream中字段的顺序,依次填充到sql中的占位符所对应的位置 * 注: * 1. fieldList指定DataStream中JavaBean的字段名称,非jdbc表中的字段名称 * 2. fieldList多个字段使用逗号分隔 * 3. fieldList中的字段顺序要与sql中占位符顺序保持一致,数量一致 * * @param sql * 增删改sql * @param fields * DataStream中数据的每一列的列名(非数据库中的列名,需与sql中占位符的顺序一致) * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 */ def jdbcBatchUpdateStream[T](stream: DataStream[T], sql: String, fields: Seq[String], batch: Int = 10, flushInterval: Long = 1000, keyNum: Int = 1): DataStreamSink[T] = { stream.jdbcBatchUpdate(sql, fields, batch, flushInterval, keyNum) } /** * jdbc批量sink操作 * * @param sql * 增删改sql * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 * @param fun * 将dstream中的数据映射为该sink组件所能处理的数据 */ def jdbcBatchUpdateStream2[T](stream: DataStream[T], sql: String, batch: Int = 10, flushInterval: Long = 1000, keyNum: Int = 1)(fun: T => Seq[Any]): DataStreamSink[T] = { stream.jdbcBatchUpdate2(sql, batch, flushInterval, keyNum)(fun) } /** * table的jdbc批量sink操作,根据用户指定的Row中字段的顺序,依次填充到sql中的占位符所对应的位置 * 注: * 1. Row中的字段顺序要与sql中占位符顺序保持一致,数量一致 * 2. 目前仅处理Retract中的true消息,用户需手动传入merge语句 * * @param sql * 增删改sql * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 */ def jdbcBatchUpdateTable(table: Table, sql: String, batch: Int = 10, flushInterval: Long = 1000, isMerge: Boolean = true, keyNum: Int = 1): DataStreamSink[Row] = { table.jdbcBatchUpdate(sql, batch, flushInterval, isMerge, keyNum) } /** * table的jdbc批量sink操作,该api需用户定义row的取数规则,并与sql中的占位符对等 * * @param sql * 增删改sql * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 */ def jdbcBatchUpdateTable2(table: Table, sql: String, batch: Int = 10, flushInterval: Long = 1000, isMerge: Boolean = true, keyNum: Int = 1)(fun: Row => Seq[Any]): DataStreamSink[Row] = { table.jdbcBatchUpdate2(sql, batch, flushInterval, isMerge, keyNum)(fun) } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/stream/DataStreamExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.ext.stream import java.lang.reflect.Field import com.zto.fire.common.util.ReflectionUtils import com.zto.fire.flink.sink.{HBaseSink, JdbcSink} import com.zto.fire.flink.util.FlinkSingletonFactory import com.zto.fire.hbase.bean.HBaseBaseBean import com.zto.fire._ import com.zto.fire.hbase.HBaseConnector import org.apache.commons.lang3.StringUtils import org.apache.flink.api.common.accumulators.SimpleAccumulator import org.apache.flink.api.common.functions.RichMapFunction import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.configuration.Configuration import org.apache.flink.streaming.api.datastream.DataStreamSink import org.apache.flink.streaming.api.scala.function.AllWindowFunction import org.apache.flink.streaming.api.scala.{DataStream, _} import org.apache.flink.streaming.api.windowing.windows.GlobalWindow import org.apache.flink.table.api.Table import org.apache.flink.table.api.bridge.scala._ import org.apache.flink.types.Row import org.apache.flink.util.Collector import scala.collection.mutable.ListBuffer import scala.reflect.ClassTag /** * 用于对Flink DataStream的API库扩展 * * @author ChengLong 2020年1月7日 09:18:21 * @since 0.4.1 */ class DataStreamExt[T](stream: DataStream[T]) { lazy val tableEnv = FlinkSingletonFactory.getTableEnv.asInstanceOf[StreamTableEnvironment] /** * 将流注册为临时表 * * @param tableName * 临时表的表名 */ def createOrReplaceTempView(tableName: String): Table = { val table = this.stream.toTable(this.tableEnv) this.tableEnv.createTemporaryView(tableName, table) table } /** * 为当前DataStream设定uid与name * * @param uid * uid * @param name * name * @return * 当前实例 */ def uname(uid: String, name: String = ""): DataStream[T] = { if (StringUtils.isNotBlank(uid)) stream.uid(uid) if (StringUtils.isNotBlank(name)) stream.name(name) this.stream } /** * 预先注册flink累加器 * * @param acc * 累加器实例 * @param name * 累加器名称 * @return * 注册累加器之后的流 */ def registerAcc(acc: SimpleAccumulator[_], name: String): DataStream[String] = { this.stream.map(new RichMapFunction[T, String] { override def open(parameters: Configuration): Unit = { this.getRuntimeContext.addAccumulator(name, acc) } override def map(value: T): String = value.toString }) } /** * 将流映射为批流 * * @param count * 将指定数量的合并为一个集合 */ def countWindowSimple[T: ClassTag](count: Long): DataStream[List[T]] = { implicit val typeInfo = TypeInformation.of(classOf[List[T]]) stream.asInstanceOf[DataStream[T]].countWindowAll(Math.abs(count)).apply(new AllWindowFunction[T, List[T], GlobalWindow]() { override def apply(window: GlobalWindow, input: Iterable[T], out: Collector[List[T]]): Unit = { out.collect(input.toList) } })(typeInfo) } /** * 设置并行度 */ def repartition(parallelism: Int): DataStream[T] = { this.stream.setParallelism(parallelism) } /** * 将DataStream转为Table */ def toTable: Table = { this.tableEnv.fromDataStream(this.stream) } /** * jdbc批量sink操作,根据用户指定的DataStream中字段的顺序,依次填充到sql中的占位符所对应的位置 * 若DataStream为DataStream[Row]类型,则fields可以为空,但此时row中每列的顺序要与sql占位符顺序一致,数量和类型也要一致 * 注: * 1. fieldList指定DataStream中JavaBean的字段名称,非jdbc表中的字段名称 * 2. fieldList多个字段使用逗号分隔 * 3. 
fieldList中的字段顺序要与sql中占位符顺序保持一致,数量一致 * * @param sql * 增删改sql * @param fields * DataStream中数据的每一列的列名(非数据库中的列名,需与sql中占位符的顺序一致) * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 */ def jdbcBatchUpdate(sql: String, fields: Seq[String], batch: Int = 10, flushInterval: Long = 1000, keyNum: Int = 1): DataStreamSink[T] = { this.stream.addSink(new JdbcSink[T](sql, batch = batch, flushInterval = flushInterval, keyNum = keyNum) { var fieldMap: java.util.Map[String, Field] = _ var clazz: Class[_] = _ override def map(value: T): Seq[Any] = { requireNonEmpty(sql)("sql语句不能为空") val params = ListBuffer[Any]() if (value.isInstanceOf[Row] || value.isInstanceOf[Tuple2[Boolean, Row]]) { // 如果是Row类型的DataStream[Row] val row = if (value.isInstanceOf[Row]) value.asInstanceOf[Row] else value.asInstanceOf[Tuple2[Boolean, Row]]._2 for (i <- 0 until row.getArity) { params += row.getField(i) } } else { requireNonEmpty(fields)("字段列表不能为空!需按照sql中的占位符顺序依次指定当前DataStream中数据字段的名称") if (clazz == null && value != null) { clazz = value.getClass fieldMap = ReflectionUtils.getAllFields(clazz) } fields.foreach(fieldName => { val field = this.fieldMap.get(StringUtils.trim(fieldName)) requireNonEmpty(field)(s"当前DataStream中不存在该列名$fieldName,请检查!") params += field.get(value) }) } params } }).name("fire jdbc stream sink") } /** * jdbc批量sink操作 * * @param sql * 增删改sql * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 * @param fun * 将dstream中的数据映射为该sink组件所能处理的数据 */ def jdbcBatchUpdate2(sql: String, batch: Int = 10, flushInterval: Long = 1000, keyNum: Int = 1)(fun: T => Seq[Any]): DataStreamSink[T] = { this.stream.addSink(new JdbcSink[T](sql, batch = batch, flushInterval = flushInterval, keyNum = keyNum) { override def map(value: T): Seq[Any] = { fun(value) } }).name("fire jdbc stream sink") } /** * hbase批量sink操作,DataStream[T]中的T必须是HBaseBaseBean的子类 * * @param tableName * hbase表名 * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 */ def hbasePutDS[E <: HBaseBaseBean[E] : ClassTag](tableName: String, batch: Int = 100, flushInterval: Long = 3000, keyNum: Int = 1): DataStreamSink[_] = { this.hbasePutDS2[E](tableName, batch, flushInterval, keyNum) { value => { value.asInstanceOf[E] } } } /** * hbase批量sink操作,DataStream[T]中的T必须是HBaseBaseBean的子类 * * @param tableName * hbase表名 * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 * @param fun * 将dstream中的数据映射为该sink组件所能处理的数据 */ def hbasePutDS2[E <: HBaseBaseBean[E] : ClassTag](tableName: String, batch: Int = 100, flushInterval: Long = 3000, keyNum: Int = 1)(fun: T => E): DataStreamSink[_] = { HBaseConnector.checkClass[E]() this.stream.addSink(new HBaseSink[T, E](tableName, batch, flushInterval, keyNum) { /** * 将数据构建成sink的格式 */ override def map(value: T): E = fun(value) }).name("fire hbase stream sink") } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/stream/KeyedStreamExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.ext.stream import org.apache.flink.streaming.api.TimeCharacteristic import org.apache.flink.streaming.api.scala.{KeyedStream, WindowedStream} import org.apache.flink.streaming.api.windowing.assigners._ import org.apache.flink.streaming.api.windowing.time.Time import org.apache.flink.streaming.api.windowing.windows.Window /** * 用于对Flink KeyedStream的API库扩展 * * @author ChengLong * @since 2.0.0 * @create 2021-01-15 10:20 */ class KeyedStreamExt[T, K](keyedStream: KeyedStream[T, K]) { /** * 创建滑动窗口 * * @param size * 窗口的大小 * @param slide * 窗口滑动间隔 * @param offset * 时区 * @param timeCharacteristic 时间类别 */ def slidingTimeWindow[W <: Window](size: Time, slide: Time, offset: Time = Time.milliseconds(0), timeCharacteristic: TimeCharacteristic = TimeCharacteristic.ProcessingTime): WindowedStream[T, K, W] = { if (timeCharacteristic == TimeCharacteristic.EventTime) { keyedStream.window(SlidingEventTimeWindows.of(size, slide, offset).asInstanceOf[WindowAssigner[T, W]]) } else { keyedStream.window(SlidingProcessingTimeWindows.of(size, slide, offset).asInstanceOf[WindowAssigner[T, W]]) } } /** * 创建滚动窗口窗口 * * @param size * 窗口的大小 * @param offset * 时区 * @param timeCharacteristic 时间类别 */ def tumblingTimeWindow[W <: Window](size: Time, offset: Time = Time.milliseconds(0), timeCharacteristic: TimeCharacteristic = TimeCharacteristic.ProcessingTime): WindowedStream[T, K, W] = { if (timeCharacteristic == TimeCharacteristic.EventTime) { keyedStream.window(TumblingEventTimeWindows.of(size, offset).asInstanceOf[WindowAssigner[T, W]]) } else { keyedStream.window(TumblingProcessingTimeWindows.of(size, offset).asInstanceOf[WindowAssigner[T, W]]) } } /** * 创建session会话窗口 * * @param size * 超时时间 * @param timeCharacteristic 时间类别 */ def sessionTimeWindow[W <: Window](size: Time, timeCharacteristic: TimeCharacteristic = TimeCharacteristic.ProcessingTime): WindowedStream[T, K, W] = { if (timeCharacteristic == TimeCharacteristic.EventTime) { keyedStream.window(EventTimeSessionWindows.withGap(size).asInstanceOf[WindowAssigner[T, W]]) } else { keyedStream.window(ProcessingTimeSessionWindows.withGap(size).asInstanceOf[WindowAssigner[T, W]]) } } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/stream/RowExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.flink.ext.stream import com.zto.fire.flink.bean.FlinkTableSchema import com.zto.fire.flink.util.FlinkUtils import org.apache.flink.types.Row /** * 用于flink Row API库扩展 * * @author ChengLong 2020年3月30日 17:00:05 * @since 0.4.1 */ class RowExt(row: Row) { /** * 将flink的row转为指定类型的JavaBean * @param schema * 表的schema * @param clazz * 目标JavaBean类型 */ def rowToBean[T](schema: FlinkTableSchema, clazz: Class[T]): T = { FlinkUtils.rowToBean(schema, row, clazz) } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/stream/SQLExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.ext.stream import com.zto.fire.common.util.{Logging, PropUtils} import com.zto.fire.flink.conf.FireFlinkConf import com.zto.fire.{noEmpty, requireNonEmpty} import com.zto.fire._ /** * Flink SQL扩展类 * * @author ChengLong 2021-4-23 10:36:49 * @since 2.0.0 */ class SQLExt(sql: String) extends Logging { // 用于匹配Flink SQL中的with表达式 private[this] lazy val withPattern = """(with|WITH)\s*\(([\s\S]*)(\)|;)$""".r // 用于匹配Flink SQL中的create语句 private[this] lazy val createTablePattern = """^\s*(create|CREATE)\s+(table|TABLE)""".r private[this] lazy val withMapCache = new JConcurrentHashMap[Int, Map[String, String]]() /** * 将给定的不包含with表达式的Flink SQL添加with表达式 * * @param keyNum * with表达式在配置文件中声明的keyNum,小于零时,则表示不拼接with表达式 * @return * 组装了with表达式的Flink SQL文本 */ def with$(keyNum: Int = 1): String = { requireNonEmpty(sql, "sql语句不能为空!") // 未开启with表达式替换或keyNum不合法,则直接返回 if (!FireFlinkConf.sqlWithReplaceModeEnable || keyNum < 1) return sql // 仅匹配create table语句,进行with表达式处理 val createTableMatcher = this.createTablePattern.findFirstIn(sql) // 非create table语句,直接返回 if (createTableMatcher.isEmpty) return sql // 匹配sql中的with表达式,如果sql中已经定义了with表达式,则不做替换 val withMatcher = withPattern.findFirstIn(sql) if (withMatcher.isDefined) return sql // 从配置文件中获取指定keyNum的with参数,keyNum确定唯一的sql语句 val withMap = withMapCache.getOrElse(keyNum, PropUtils.sliceKeysByNum(FireFlinkConf.FLINK_SQL_WITH_PREFIX, keyNum)) // 当sql语句中没有指定with表达式并且没有配置with参数,则进行提示 if (withMap.isEmpty && withMatcher.isEmpty) { this.logger.error(s"未搜索到keyNum=${keyNum}对应的sql配置列表,请以${FireFlinkConf.FLINK_SQL_WITH_PREFIX}开头,以${keyNum}结尾进行配置") return sql } // 替换create table语句中的with表达式,并返回最终的sql val fixSql = if (withMatcher.isDefined) withPattern.replaceAllIn(sql, "") else sql val finalSQL = buildWith(fixSql, withMap) if (FireFlinkConf.sqlLogEnable) logger.debug(s"完整SQL语句:$finalSQL") finalSQL } /** * 根据给定的配置列表构建Flink SQL with表达式 * * @param map * Flink SQL with配置列表 * @return * with sql表达式 */ private[fire] def buildWith(sql: String, map: Map[String, String]): String = { val withSql = new StringBuilder() withSql.append(sql).append("WITH (\n") 
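// Illustrative sketch (not taken from the source) of the text built at this step: assuming the
// with-options resolved for this keyNum were connector -> kafka and topic -> mytopic (key and
// value are hypothetical), the builder would yield
//   <create table ddl>WITH (
//       'connector'='kafka',
//       'topic'='mytopic'
//   )
// Each pair is rendered below as \t'key'='value',\n and the trailing ",\n" is trimmed off before ")" is appended.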
map.filter(conf => noEmpty(conf, conf._1, conf._2)).foreach(conf => { withSql .append(s"""\t'${conf._1}'=""") .append(s"'${conf._2}'") .append(",\n") }) withSql.substring(0, withSql.length - 2) + "\n)" } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/stream/StreamExecutionEnvExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.ext.stream import com.zto.fire._ import com.zto.fire.common.enu.{Operation => FOperation} import com.zto.fire.common.conf.{FireKafkaConf, FireRocketMQConf} import com.zto.fire.common.util.{KafkaUtils, LineageManager, RegularUtils, SQLUtils} import com.zto.fire.core.Api import com.zto.fire.flink.ext.provider.{HBaseConnectorProvider, JdbcFlinkProvider} import com.zto.fire.flink.sql.FlinkSqlExtensionsParser import com.zto.fire.flink.util.{FlinkSingletonFactory, FlinkUtils, RocketMQUtils} import com.zto.fire.jdbc.JdbcConnectorBridge import org.apache.commons.lang3.StringUtils import org.apache.flink.api.common.functions.RuntimeContext import org.apache.flink.api.common.serialization.SimpleStringSchema import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.scala._ import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment} import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer import org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicPartition import org.apache.flink.streaming.util.serialization.JSONKeyValueDeserializationSchema import org.apache.flink.table.api.{StatementSet, Table, TableResult} import org.apache.rocketmq.flink.common.serialization.SimpleTagKeyValueDeserializationSchema import org.apache.rocketmq.flink.{RocketMQConfig, RocketMQSourceWithTag} import java.util.Properties import java.util.concurrent.atomic.AtomicBoolean import scala.collection.JavaConversions import scala.reflect.ClassTag /** * 用于对Flink StreamExecutionEnvironment的API库扩展 * * @author ChengLong 2020年1月7日 09:18:21 * @since 0.4.1 */ class StreamExecutionEnvExt(env: StreamExecutionEnvironment) extends Api with TableApi with JdbcConnectorBridge with HBaseConnectorProvider with JdbcFlinkProvider { private[fire] lazy val tableEnv = FlinkSingletonFactory.getTableEnv /** * 创建Socket流 */ def createSocketTextStream(hostname: String, port: Int, delimiter: Char = '\n', maxRetry: Long = 0): DataStream[String] = { this.env.socketTextStream(hostname, port, delimiter, maxRetry) } /** * 根据配置信息创建Kafka Consumer * * @param kafkaParams * kafka相关的配置参数 * @return * FlinkKafkaConsumer011 */ def createKafkaConsumer[T](kafkaParams: Map[String, Object] = null, topics: Set[String] = 
null, deserializer: Any = new SimpleStringSchema, keyNum: Int = 1): FlinkKafkaConsumer[T] = { val confTopics = FireKafkaConf.kafkaTopics(keyNum) val topicList = if (StringUtils.isNotBlank(confTopics)) confTopics.split(",") else if (topics != null) topics.toArray else null require(topicList != null && topicList.nonEmpty, s"kafka topic不能为空,请在配置文件中指定:kafka.topics$keyNum") val confKafkaParams = KafkaUtils.kafkaParams(kafkaParams, FlinkSingletonFactory.getAppName, keyNum = keyNum) // 配置文件中相同的key优先级高于代码中的 require(confKafkaParams.nonEmpty, "kafka相关配置不能为空!") require(confKafkaParams.contains("bootstrap.servers"), s"kafka bootstrap.servers不能为空,请在配置文件中指定:kafka.brokers.name$keyNum") require(confKafkaParams.contains("group.id"), s"kafka group.id不能为空,请在配置文件中指定:kafka.group.id$keyNum") require(deserializer != null, "deserializer不能为空,默认SimpleStringSchema") val properties = new Properties() confKafkaParams.foreach(t => properties.setProperty(t._1, t._2.toString)) // 添加topic列表信息 val topicsStr = topicList.mkString("", ", ", "") properties.setProperty("kafka.topics", topicsStr) // 添加二次开发相关配置信息 properties.setProperty(FireKafkaConf.KAFKA_OVERWRITE_STATE_OFFSET, FireKafkaConf.kafkaForceOverwriteStateOffset.toString) properties.setProperty(FireKafkaConf.KAFKA_FORCE_AUTO_COMMIT, FireKafkaConf.kafkaForceCommit.toString) properties.setProperty(FireKafkaConf.KAFKA_FORCE_AUTO_COMMIT_INTERVAL, FireKafkaConf.kafkaForceCommitInterval.toString) // 消费kafka埋点信息 LineageManager.addMQDatasource("kafka", confKafkaParams("bootstrap.servers").toString, topicsStr, confKafkaParams("group.id").toString, FOperation.SOURCE) deserializer match { case schema: JSONKeyValueDeserializationSchema => new FlinkKafkaConsumer[ObjectNode](JavaConversions.seqAsJavaList(topicList.map(topic => StringUtils.trim(topic))), schema, properties).asInstanceOf[FlinkKafkaConsumer[T]] case _ => new FlinkKafkaConsumer[String](JavaConversions.seqAsJavaList(topicList.map(topic => StringUtils.trim(topic))), new SimpleStringSchema, properties).asInstanceOf[FlinkKafkaConsumer[T]] } } /** * 可指定支持的deserializer创建DStream流 * * @param kafkaParams * kafka相关的配置参数 * @return * DStream */ def createDirectStreamBySchema[T: TypeInformation : ClassTag](kafkaParams: Map[String, Object] = null, topics: Set[String] = null, specificStartupOffsets: Map[KafkaTopicPartition, java.lang.Long] = null, runtimeContext: RuntimeContext = null, deserializer: Any = new SimpleStringSchema, keyNum: Int = 1): DataStream[T] = { val kafkaConsumer = this.createKafkaConsumer[T](kafkaParams, topics, deserializer, keyNum) if (runtimeContext != null) kafkaConsumer.setRuntimeContext(runtimeContext) if (specificStartupOffsets != null) kafkaConsumer.setStartFromSpecificOffsets(specificStartupOffsets) // 设置从指定时间戳位置开始消费kafka val startFromTimeStamp = FireKafkaConf.kafkaStartFromTimeStamp(keyNum) if (startFromTimeStamp > 0) kafkaConsumer.setStartFromTimestamp(FireKafkaConf.kafkaStartFromTimeStamp(keyNum)) // 是否在checkpoint时记录offset值 kafkaConsumer.setCommitOffsetsOnCheckpoints(FireKafkaConf.kafkaCommitOnCheckpoint(keyNum)) // 设置从最早的位置开始消费 if (FireKafkaConf.offsetSmallest.equalsIgnoreCase(FireKafkaConf.kafkaStartingOffset(keyNum))) kafkaConsumer.setStartFromEarliest() // 设置从最新位置开始消费 if (FireKafkaConf.offsetLargest.equalsIgnoreCase(FireKafkaConf.kafkaStartingOffset(keyNum))) kafkaConsumer.setStartFromLatest() // 从topic中指定的group上次消费的位置开始消费,必须配置group.id参数 if (FireKafkaConf.kafkaStartFromGroupOffsets(keyNum)) kafkaConsumer.setStartFromGroupOffsets() this.env.addSource(kafkaConsumer) } /** * 创建DStream流 * * @param 
kafkaParams * kafka相关的配置参数 * @return * DStream */ def createDirectStream(kafkaParams: Map[String, Object] = null, topics: Set[String] = null, specificStartupOffsets: Map[KafkaTopicPartition, java.lang.Long] = null, runtimeContext: RuntimeContext = null, keyNum: Int = 1): DataStream[String] = { this.createDirectStreamBySchema[String](kafkaParams, topics, specificStartupOffsets, runtimeContext, keyNum = keyNum) } /** * 基于指定的schema创建DStream流 * * @param kafkaParams * kafka相关的配置参数 * @return * DStream */ def createDirectStreamByJsonKeyValue(kafkaParams: Map[String, Object] = null, topics: Set[String] = null, specificStartupOffsets: Map[KafkaTopicPartition, java.lang.Long] = null, runtimeContext: RuntimeContext = null, keyNum: Int = 1): DataStream[ObjectNode] = { this.createDirectStreamBySchema[ObjectNode](kafkaParams, topics, specificStartupOffsets, runtimeContext, new JSONKeyValueDeserializationSchema(true), keyNum) } /** * 创建DStream流,以SimpleStringSchema进行反序列化 * * @param kafkaParams * kafka相关的配置参数 * @return * DStream */ def createKafkaDirectStream(kafkaParams: Map[String, Object] = null, topics: Set[String] = null, specificStartupOffsets: Map[KafkaTopicPartition, java.lang.Long] = null, runtimeContext: RuntimeContext = null, keyNum: Int = 1): DataStream[String] = { this.createDirectStream(kafkaParams, topics, specificStartupOffsets, runtimeContext, keyNum) } /** * 创建DStream流,以JSONKeyValueDeserializationSchema进行反序列化 * * @param kafkaParams * kafka相关的配置参数 * @return * DStream */ def createKafkaDirectStreamByJsonKeyValue(kafkaParams: Map[String, Object] = null, topics: Set[String] = null, specificStartupOffsets: Map[KafkaTopicPartition, java.lang.Long] = null, runtimeContext: RuntimeContext = null, keyNum: Int = 1): DataStream[ObjectNode] = { this.createDirectStreamByJsonKeyValue(kafkaParams, topics, specificStartupOffsets, runtimeContext, keyNum) } /** * 构建RocketMQ拉取消息的DStream流,获取消息中的tag、key以及value * * @param rocketParam * rocketMQ相关消费参数 * @param groupId * groupId * @param topics * topic列表 * @return * rocketMQ DStream */ def createRocketMqPullStreamWithTag(rocketParam: Map[String, String] = null, groupId: String = null, topics: String = null, tag: String = null, keyNum: Int = 1): DataStream[(String, String, String)] = { // 获取topic信息,配置文件优先级高于代码中指定的 val confTopics = FireRocketMQConf.rocketTopics(keyNum) val finalTopics = if (StringUtils.isNotBlank(confTopics)) confTopics else topics require(StringUtils.isNotBlank(finalTopics), s"RocketMQ的Topics不能为空,请在配置文件中指定:rocket.topics$keyNum") // groupId信息 val confGroupId = FireRocketMQConf.rocketGroupId(keyNum) val finalGroupId = if (StringUtils.isNotBlank(confGroupId)) confGroupId else groupId require(StringUtils.isNotBlank(finalGroupId), s"RocketMQ的groupId不能为空,请在配置文件中指定:rocket.group.id$keyNum") // 详细的RocketMQ配置信息 val finalRocketParam = RocketMQUtils.rocketParams(rocketParam, finalTopics, finalGroupId, rocketNameServer = null, tag = tag, keyNum) require(!finalRocketParam.isEmpty, "RocketMQ相关配置不能为空!") require(finalRocketParam.containsKey(RocketMQConfig.NAME_SERVER_ADDR), s"RocketMQ nameserver.address不能为空,请在配置文件中指定:rocket.brokers.name$keyNum") // 消费rocketmq埋点信息 LineageManager.addMQDatasource("rocketmq", finalRocketParam(RocketMQConfig.NAME_SERVER_ADDR), finalTopics, finalGroupId, FOperation.SOURCE) val props = new Properties() props.putAll(finalRocketParam) this.env.addSource(new RocketMQSourceWithTag[(String, String, String)](new SimpleTagKeyValueDeserializationSchema, props)).name("RocketMQ Source") } /** * 构建RocketMQ拉取消息的DStream流,仅获取消息体中的key和value * * @param 
rocketParam * rocketMQ相关消费参数 * @param groupId * groupId * @param topics * topic列表 * @return * rocketMQ DStream */ def createRocketMqPullStreamWithKey(rocketParam: Map[String, String] = null, groupId: String = null, topics: String = null, tag: String = null, keyNum: Int = 1): DataStream[(String, String)] = { this.createRocketMqPullStreamWithTag(rocketParam, groupId, topics, tag, keyNum).map(t => (t._2, t._3)) } /** * 构建RocketMQ拉取消息的DStream流,仅获取消息体中的value * * @param rocketParam * rocketMQ相关消费参数 * @param groupId * groupId * @param topics * topic列表 * @return * rocketMQ DStream */ def createRocketMqPullStream(rocketParam: Map[String, String] = null, groupId: String = null, topics: String = null, tag: String = null, keyNum: Int = 1): DataStream[String] = { this.createRocketMqPullStreamWithTag(rocketParam, groupId, topics, tag, keyNum).map(t => t._3) } /** * 执行sql query操作 * * @param sql * sql语句 * @return * table对象 */ def sqlQuery(sql: String): Table = { SQLUtils.executeSql(sql) ( statement => this.tableEnv.sqlQuery(FlinkUtils.sqlWithReplace(statement))).get } /** * 执行sql语句 * 支持DDL、DML */ def sql(sql: String): TableResult = { SQLUtils.executeSql(sql) { statement => if (this.isInsertStatement(statement)) { FlinkSqlExtensionsParser.sqlParse(statement) this.addInsertSql(statement) // 为兼容flink1.12,使用反射调用返回TABLE_RESULT_OK val tableResultClass = Class.forName("org.apache.flink.table.api.internal.TableResultImpl") val field = tableResultClass.getDeclaredField("TABLE_RESULT_OK") field.setAccessible(true) field.get(null).asInstanceOf[TableResult] } else { val finalSql = FlinkUtils.sqlWithReplace(statement) FlinkSqlExtensionsParser.sqlParse(finalSql) this.tableEnv.executeSql(finalSql) } }.get } /** * 创建并返回StatementSet对象实例 */ def createStatementSet: StatementSet = StreamExecutionEnvExt.createStatementSet /** * 使用正则匹配执行的sql语句是否为insert语句 */ private[this] def isInsertStatement(sql: String): Boolean = { RegularUtils.insertReg.findFirstIn(sql.toUpperCase).isDefined } /** * 将待执行的sql sink语句加入到StatementSet中 * * @param sql * insert xxx语句 * @return * StatementSet */ def addInsertSql(sql: String): StatementSet = { StreamExecutionEnvExt.useStatementSet.compareAndSet(false, true) SQLUtils.executeSql(sql) (sql => StreamExecutionEnvExt.statementSet.addInsertSql(sql)).get } /** * addInsertSql方法的别名,将待执行的sql sink语句加入到StatementSet中 * * @param sql * insert xxx语句 * @return * StatementSet */ def sqlSink(sql: String): StatementSet = { this.addInsertSql(sql) } /** * addInsertSql方法的别名,将待执行的sql sink语句加入到StatementSet中 * * @param sql * insert xxx语句 * @return * StatementSet */ def sqlInsert(sql: String): StatementSet = this.addInsertSql(sql) /** * 将table sink加入到StatementSet中 */ def addInsert(targetPath: String, table: Table, overwrite: Boolean = false): StatementSet = { StreamExecutionEnvExt.useStatementSet.compareAndSet(false, true) StreamExecutionEnvExt.statementSet.addInsert(targetPath, table, overwrite) } /** * 使用集合元素创建DataStream * * @param seq * 元素集合 * @tparam T * 元素的类型 */ def parallelize[T: TypeInformation](seq: Seq[T]): DataStream[T] = { this.env.fromCollection[T](seq) } /** * 使用集合元素创建DataStream * * @param seq * 元素集合 * @tparam T * 元素的类型 */ def createCollectionStream[T: TypeInformation](seq: Seq[T]): DataStream[T] = this.env.fromCollection[T](seq) /** * 提交job执行 * * @param jobName * job名称 */ def startAwaitTermination(jobName: String = FlinkSingletonFactory.getAppName): Any = { if (StreamExecutionEnvExt.useStatementSet.get()) StreamExecutionEnvExt.statementSet.execute() else this.env.execute(jobName) } /** * 提交Flink Streaming 
Graph并执行 */ def start(jobName: String): Any = this.startAwaitTermination(jobName) /** * 流的启动 */ override def start: Any = this.startAwaitTermination() } private[fire] object StreamExecutionEnvExt { private[fire] lazy val statementSet = this.createStatementSet private[fire] lazy val useStatementSet = new AtomicBoolean(false) /** * 创建并返回StatementSet对象实例 */ def createStatementSet: StatementSet = FlinkSingletonFactory.getTableEnv.createStatementSet() } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/stream/TableEnvExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.ext.stream import com.zto.fire.common.conf.FireHiveConf import com.zto.fire.flink.conf.FireFlinkConf import com.zto.fire.flink.util.FlinkSingletonFactory import com.zto.fire.noEmpty import org.apache.flink.table.api.{SqlDialect, Table, TableEnvironment} import org.apache.flink.table.catalog.Catalog import org.apache.flink.table.functions.ScalarFunction import java.util.Optional /** * 用于对Flink StreamTableEnvironment的API库扩展 * * @author ChengLong 2020年1月7日 09:18:21 * @since 0.4.1 */ class TableEnvExt(tableEnv: TableEnvironment) extends TableApi { // 获取hive catalog lazy val hiveCatalog = this.getHiveCatalog /** * 尝试获取注册的hive catalog对象 */ private def getHiveCatalog: Optional[Catalog] = { // 如果使用fire框架,则通过指定的hive catalog名称获取catalog实例 val catalog = this.tableEnv.getCatalog(FireHiveConf.hiveCatalogName) if (catalog.isPresent) catalog else { // 如果fire未使用fire框架,尝试获取名称包含hive的catalog val hiveCatalogName = this.tableEnv.listCatalogs().filter(_.contains("hive")) if (noEmpty(hiveCatalogName)) this.tableEnv.getCatalog(hiveCatalogName(0)) else Optional.empty() } } } trait TableApi { private lazy val tableEnv = FlinkSingletonFactory.getTableEnv // 获取默认的catalog lazy val defaultCatalog = this.tableEnv.getCatalog(FireFlinkConf.defaultCatalogName) /** * 注册自定义udf函数 * * @param name * 函数名 * @param function * 函数的实例 */ def udf(name: String, function: ScalarFunction): Unit = { this.tableEnv.registerFunction(name, function) } /** * 用于判断当前是否hive catalog */ def isHiveCatalog: Boolean = this.tableEnv.getCurrentCatalog.toUpperCase.contains("HIVE") /** * 用于判断当前是否为默认的catalog */ def isDefaultCatalog: Boolean = !this.isHiveCatalog /** * 使用hive catalog */ def useHiveCatalog(hiveCatalog: String = FireHiveConf.hiveCatalogName): Unit = { this.tableEnv.useCatalog(hiveCatalog) this.tableEnv.getConfig.setSqlDialect(SqlDialect.HIVE) } /** * 使用默认的catalog */ def useDefaultCatalog: Unit = { this.tableEnv.useCatalog(FireFlinkConf.defaultCatalogName) this.tableEnv.getConfig.setSqlDialect(SqlDialect.DEFAULT) } /** * 获取当前catalog */ def getCurrentCatalog: String = this.tableEnv.getCurrentCatalog /** * 创建临时视图 */ def 
createTemporaryView(path: String, view: Table): Unit = { this.tableEnv.createTemporaryView(path, view) } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/stream/TableExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.ext.stream import com.zto.fire.flink.bean.FlinkTableSchema import com.zto.fire.flink.sink.HBaseSink import com.zto.fire.flink.util.FlinkSingletonFactory import com.zto.fire.hbase.HBaseConnector import com.zto.fire.hbase.bean.HBaseBaseBean import org.apache.flink.api.scala._ import org.apache.flink.streaming.api.datastream.DataStreamSink import org.apache.flink.streaming.api.scala.DataStream import org.apache.flink.table.api.{Table, TableEnvironment} import org.apache.flink.table.api.bridge.scala._ import org.apache.flink.types.Row import scala.collection.mutable.ListBuffer import scala.reflect.ClassTag /** * 用于flink StreamTable API库扩展 * * @author ChengLong 2020年1月9日 13:52:16 * @since 0.4.1 */ class TableExt(table: Table) { lazy val streamTableEnv = FlinkSingletonFactory.getTableEnv.asInstanceOf[StreamTableEnvironment] lazy val batchTableEnv = FlinkSingletonFactory.getTableEnv.asInstanceOf[TableEnvironment] /** * 逐条打印每行记录 */ def show(): Unit = { this.table.addSink(row => println(row)) } /** * 获取表的schema包装类,用于flinkRowToBean * * @return * fire包装后的表schema信息 */ def getTableSchema: FlinkTableSchema = { new FlinkTableSchema(table.getSchema) } /** * 将Table转为追加流 */ def toAppendStream[T]: DataStream[Row] = { this.streamTableEnv.toAppendStream[Row](this.table) } /** * 将Table转为Retract流 */ def toRetractStream[T]: DataStream[(Boolean, Row)] = { this.streamTableEnv.toRetractStream[Row](this.table) } /** * 将Table转为DataSet */ /*def toDataSet[T]: DataSet[Row] = { require(this.batchTableEnv != null) this.batchTableEnv.toDataSet[Row](this.table) }*/ /** * 将流注册为临时表 * * @param tableName * 临时表的表名 */ def createOrReplaceTempView(tableName: String): Table = { if (this.streamTableEnv != null) { this.streamTableEnv.createTemporaryView(tableName, table) } else if (this.batchTableEnv != null) { this.batchTableEnv.createTemporaryView(tableName, table) } else { throw new NullPointerException("table environment对象实例为空,请检查") } table } /** * 将table映射为Retract流,仅保留新增数据和变更数据,忽略变更前为false的数据 */ def toRetractStreamSingle: DataStream[Row] = { this.table.toRetractStream[Row].filter(t => t._1).map(t => t._2) } /** * table的jdbc批量sink操作,根据用户指定的Row中字段的顺序,依次填充到sql中的占位符所对应的位置 * 注: * 1. Row中的字段顺序要与sql中占位符顺序保持一致,数量一致 * 2. 
目前仅处理Retract中的true消息,用户需手动传入merge语句 * * @param sql * 增删改sql * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 */ def jdbcBatchUpdate(sql: String, batch: Int = 10, flushInterval: Long = 1000, isMerge: Boolean = true, keyNum: Int = 1): DataStreamSink[Row] = { this.jdbcBatchUpdate2(sql, batch, flushInterval, isMerge, keyNum) { row => { val param = ListBuffer[Any]() for (i <- 0 until row.getArity) { param += row.getField(i) } param } } } /** * table的jdbc批量sink操作,该api需用户定义row的取数规则,并与sql中的占位符对等 * * @param sql * 增删改sql * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 */ def jdbcBatchUpdate2(sql: String, batch: Int = 10, flushInterval: Long = 1000, isMerge: Boolean = true, keyNum: Int = 1)(fun: Row => Seq[Any]): DataStreamSink[Row] = { import com.zto.fire._ if (!isMerge) throw new IllegalArgumentException("该jdbc sink api暂不支持非merge语义,delete操作需单独实现") this.table.toRetractStreamSingle.jdbcBatchUpdate2(sql, batch, flushInterval, keyNum) { row => fun(row) }.name("fire jdbc sink") } /** * table的hbase批量sink操作,该api需用户定义row的取数规则,并映射到对应的HBaseBaseBean的子类中 * * @param tableName * HBase表名 * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 */ def hbasePutTable[T <: HBaseBaseBean[T]: ClassTag](tableName: String, batch: Int = 100, flushInterval: Long = 3000, keyNum: Int = 1): DataStreamSink[_] = { import com.zto.fire._ this.table.hbasePutTable2[T](tableName, batch, flushInterval, keyNum) { val schema = table.getTableSchema row => { // 将row转为clazz对应的JavaBean val hbaseBean = row.rowToBean(schema, getParamType[T]) if (!hbaseBean.isInstanceOf[HBaseBaseBean[T]]) throw new IllegalArgumentException("clazz参数必须是HBaseBaseBean的子类") hbaseBean } } } /** * table的hbase批量sink操作,该api需用户定义row的取数规则,并映射到对应的HBaseBaseBean的子类中 * * @param tableName * HBase表名 * @param batch * 每次sink最大的记录数 * @param flushInterval * 多久flush一次(毫秒) * @param keyNum * 配置文件中的key后缀 */ def hbasePutTable2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, batch: Int = 100, flushInterval: Long = 3000, keyNum: Int = 1)(fun: Row => T): DataStreamSink[_] = { import com.zto.fire._ HBaseConnector.checkClass[T]() this.table.toRetractStreamSingle.addSink(new HBaseSink[Row, T](tableName, batch, flushInterval, keyNum) { override def map(value: Row): T = fun(value) }).name("fire hbase sink") } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/ext/stream/TableResultImplExt.scala ================================================ package com.zto.fire.flink.ext.stream import org.apache.flink.table.api.TableResult /** * 用于对Flink TableResult的API库扩展 * * @author ChengLong 2020年1月7日 09:18:21 * @since 2.1.0 */ class TableResultImplExt(tableResult: TableResult) { /** * 打印执行结果 */ def show(): Unit = this.tableResult.print() } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/plugin/FlinkArthasLauncher.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.plugin import com.zto.fire.common.util.OSUtils import com.zto.fire.predef.{noEmpty, _} import com.zto.fire.core.plugin.{ArthasLauncher, ArthasManager} import com.zto.fire.flink.util.FlinkUtils /** * Flink Arthas分布式启动器 * * @author ChengLong 2021-11-11 10:48:55 * @since 2.2.0 */ private[fire] class FlinkArthasLauncher extends ArthasLauncher { /** * 用于判断是否符合启动Arthas的条件 * * @param ip * 用于指定在指定的ip上启动Arthas * @return * true: 可以启动 false:不应当启动 */ private[this] def canDo(isDistribute: Boolean, ip: String): Boolean = { if (FlinkUtils.isJobManager) return true if (isDistribute && FlinkUtils.isTaskManager && (isEmpty(ip) || (noEmpty(ip) && ip.contains(OSUtils.getIp)))) true else false } /** * 热启动Arthas * * @param isDistribute * 是否在每个container端启动arthas * @param ip * 仅在某些主机上启动 */ override def hotStart(isDistribute: Boolean, ip: String): Unit = if (this.canDo(isDistribute, ip)) ArthasManager.startArthas(FlinkUtils.getResourceId) /** * 分布式热关闭Arthas相关服务 * * @param isDistribute * 是否在每个container端停止arthas * @param ip * 仅在某些主机上启动 */ override def hotStop(isDistribute: Boolean, ip: String): Unit = if (this.canDo(isDistribute, ip)) ArthasManager.stopArthas /** * 分布式热重启rthas相关服务 * * @param isDistribute * 是否在每个container端停止arthas * @param ip * 仅在某些主机上启动 */ override def hotRestart(isDistribute: Boolean, ip: String): Unit = if (this.canDo(isDistribute, ip)) ArthasManager.restartArthas(FlinkUtils.getResourceId) } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/rest/FlinkSystemRestful.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.flink.rest import com.zto.fire.common.anno.Rest import com.zto.fire.common.bean.rest.ResultMsg import com.zto.fire.common.enu.{Datasource, ErrorCode, RequestMethod} import com.zto.fire.common.util._ import com.zto.fire.core.rest.{RestCase, RestServerManager, SystemRestful} import com.zto.fire.flink.BaseFlink import com.zto.fire.flink.bean.{CheckpointParams, DistributeBean} import com.zto.fire.flink.enu.DistributeModule import com.zto.fire.flink.sync.FlinkLineageAccumulatorManager import com.zto.fire.predef._ import org.apache.commons.lang3.StringUtils import org.apache.flink.runtime.checkpoint.CheckpointCoordinator import spark._ /** * 系统预定义的restful服务,为Flink计算引擎提供接口服务 * * @author ChengLong 2020年4月2日 13:50:01 */ private[fire] class FlinkSystemRestful(var baseFlink: BaseFlink, val restfulRegister: RestServerManager) extends SystemRestful(baseFlink) { private var distributeJson = "" /** * 注册Flink引擎restful接口 */ override protected def register: Unit = { this.restfulRegister .addRest(RestCase(RequestMethod.GET.toString, s"/system/kill", kill)) .addRest(RestCase(RequestMethod.GET.toString, s"/system/datasource", datasource)) .addRest(RestCase(RequestMethod.GET.toString, s"/system/lineage", lineage)) .addRest(RestCase(RequestMethod.POST.toString, s"/system/checkpoint", checkpoint)) .addRest(RestCase(RequestMethod.GET.toString, s"/system/distributeSync", distributeSync)) .addRest(RestCase(RequestMethod.POST.toString, s"/system/setConf", setConf)) .addRest(RestCase(RequestMethod.POST.toString, s"/system/arthas", arthas)) .addRest(RestCase(RequestMethod.GET.toString, s"/system/exception", exception)) .addRest(RestCase(RequestMethod.POST.toString, s"/system/collectLineage", collectLineage)) } /** * 设置baseFlink实例 */ private[fire] def setBaseFlink(baseFlink: BaseFlink): Unit = this.baseFlink = baseFlink /** * 启用Arthas进行性能诊断 * */ @Rest("/system/arthas") override def arthas(request: Request, response: Response): AnyRef = { this.logger.info(s"Ip address ${request.ip()} request /system/arthas") val retVal = super.arthas(request, response) val json = request.body() if (JSONUtils.getValue[Boolean](json, "distribute", false)) { this.distributeJson = JSONUtils.toJSONString(new DistributeBean(DistributeModule.ARTHAS, request.body)) this.logger.info("开始分布式分发:" + this.distributeJson) } retVal } /** * 用于引擎内部分布式采集血缘信息 */ @Rest("/system/collectLineage") def collectLineage(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.debug(s"内部请求分布式更新血缘信息,ip:${request.ip()}") this.logger.debug(s"请求fire更新血缘信息:$json") val lineageMap = JSONUtils.parseObject[JConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]](json) if (ValueUtils.noEmpty(lineageMap)) { FlinkLineageAccumulatorManager.add(lineageMap) } ResultMsg.buildSuccess("血缘信息已更新", ErrorCode.SUCCESS.toString) } catch { case e: Exception => { this.logger.error(s"[collectLineage] 设置血缘信息失败:json=$json", e) ResultMsg.buildError("设置血缘信息失败", ErrorCode.ERROR) } } } /** * 用于引擎内部分布式同步信息 */ @Rest("/system/distributeSync") def distributeSync(request: Request, response: Response): AnyRef = { this.logger.debug(s"内部请求分布式更新信息,ip:${request.ip()}") this.distributeJson } /** * 用于更新配置信息 */ @Rest("/system/setConf") def setConf(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/setConf") this.logger.info(s"请求fire更新配置信息:$json") val confMap = JSONUtils.parseObject[JHashMap[String, String]](json) if (ValueUtils.noEmpty(confMap)) { 
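// The request body has been parsed above into a flat JHashMap[String, String]; a hypothetical
// payload therefore looks like {"some.conf.key": "value"} (key name for illustration only).
// The map is merged into PropUtils below and recorded as distributeJson, which the
// /system/distributeSync endpoint exposes for engine-internal distributed synchronization.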
PropUtils.setProperties(confMap) this.distributeJson = JSONUtils.toJSONString(new DistributeBean(DistributeModule.CONF, json)) } ResultMsg.buildSuccess("配置信息已更新", ErrorCode.SUCCESS.toString) } catch { case e: Exception => { this.logger.error(s"[setConf] 设置配置信息失败:json=$json", e) ResultMsg.buildError("设置配置信息失败", ErrorCode.ERROR) } } } /** * 用于运行时热修改checkpoint */ @Rest("/system/checkpoint") def checkpoint(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/checkpoint") val checkpointParams = JSONUtils.parseObject[CheckpointParams](json) val clazz = classOf[CheckpointCoordinator] // 获取静态方法 val getInstance = ReflectionUtils.getMethodByName(clazz, "getInstance") if (getInstance != null) { // 获取CheckpointCoordinator单例对象 val coordinator = getInstance.invoke(null) if (coordinator != null) { val target = coordinator.asInstanceOf[CheckpointCoordinator] // 重新设置checkpoint的频率 if (checkpointParams.getInterval != null) ReflectionUtils.getMethodByName(clazz, "setBaseInterval").invoke(target, checkpointParams.getInterval) // 重新设置checkpoint的超时时间 if (checkpointParams.getTimeout != null) ReflectionUtils.getMethodByName(clazz, "setCheckpointTimeout").invoke(target, checkpointParams.getTimeout) // 重新设置两次相邻checkpoint的最短时间间隔 if (checkpointParams.getMinPauseBetween != null) ReflectionUtils.getMethodByName(clazz, "setMinPauseBetweenCheckpoints").invoke(target, checkpointParams.getMinPauseBetween) // 重新调度checkpoint target.startCheckpointScheduler() } } this.logger.info(s"[checkpoint] 执行checkpoint热修改成功:interval=${checkpointParams.getInterval} timeout=${checkpointParams.getTimeout} minPauseBetween=${checkpointParams.getMinPauseBetween} json=$json", "rest") ResultMsg.buildSuccess(s"执行checkpoint热修改成功:interval=${checkpointParams.getInterval} timeout=${checkpointParams.getTimeout} minPauseBetween=${checkpointParams.getMinPauseBetween}", ErrorCode.SUCCESS.toString) } catch { case e: Exception => { this.logger.error(s"[checkpoint] 执行checkpoint热修改失败:json=$json", e) ResultMsg.buildError("执行checkpoint热修改失败", ErrorCode.ERROR) } } } /** * kill 当前 Flink 任务 */ @Rest("/system/kill") def kill(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/kill") // 参数校验与参数获取 this.baseFlink.shutdown() this.logger.info(s"[kill] kill任务成功:json=$json") ResultMsg.buildSuccess("任务停止成功", ErrorCode.SUCCESS.toString) } catch { case e: Exception => { this.logger.error(s"[kill] 执行kill任务失败:json=$json", e) ResultMsg.buildError("执行kill任务失败", ErrorCode.ERROR) } } } /** * 用于执行sql语句 */ @Rest(value = "/system/sql", method = "post") def sql(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/sql") // 参数校验与参数获取 val sql = JSONUtils.getValue(json, "sql", "") // sql合法性检查 if (StringUtils.isBlank(sql) || !sql.toLowerCase.trim.startsWith("select ")) { this.logger.warn(s"[sql] sql不合法,在线调试功能只支持查询操作:json=$json") return ResultMsg.buildError(s"sql不合法,在线调试功能只支持查询操作", ErrorCode.ERROR) } if (this.baseFlink == null) { this.logger.warn(s"[sql] 系统正在初始化,请稍后再试:json=$json") return "系统正在初始化,请稍后再试" } "" } catch { case e: Exception => { this.logger.error(s"[sql] 执行用户sql失败:json=$json", e) ResultMsg.buildError("执行用户sql失败,异常堆栈:" + ExceptionBus.stackTrace(e), ErrorCode.ERROR) } } } } ================================================ FILE: 
fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/sql/FlinkSqlExtensionsParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.sql import com.zto.fire.common.conf.FireFrameworkConf.lineageEnable import com.zto.fire.common.util.SQLLineageManager import com.zto.fire.core.sql.SqlExtensionsParser import com.zto.fire.flink.util.FlinkUtils import com.zto.fire.predef.noEmpty /** * flink sql解析器 * * @author ChengLong * @date 2022-05-10 10:03:52 * @since 2.2.2 */ private[fire] object FlinkSqlExtensionsParser extends SqlExtensionsParser { /** * 将待解析的SQL添加到buffer中 */ def sqlParse(sql: String): Unit = { if (lineageEnable && noEmpty(sql)) { FlinkUtils.sqlValidate(sql) FlinkSqlParser.sqlParse(sql) } } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/sql/FlinkSqlParserBase.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.flink.sql import com.zto.fire._ import com.zto.fire.common.anno.Internal import com.zto.fire.common.bean.TableIdentifier import com.zto.fire.common.conf.{FireHiveConf, FireKafkaConf, FireRocketMQConf} import com.zto.fire.common.enu.{Datasource, Operation} import com.zto.fire.common.util.{LineageManager, ReflectionUtils, SQLLineageManager} import com.zto.fire.core.sql.SqlParser import com.zto.fire.flink.conf.FireFlinkConf import com.zto.fire.flink.util.{FlinkSingletonFactory, FlinkUtils} import com.zto.fire.jdbc.conf.FireJdbcConf import org.apache.calcite.sql._ import org.apache.flink.sql.parser.SqlProperty import org.apache.flink.sql.parser.ddl._ import org.apache.flink.sql.parser.dml._ import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment import org.apache.flink.table.catalog.ObjectPath import org.apache.flink.table.catalog.hive.HiveCatalog import org.apache.hadoop.hive.metastore.api.Table import scala.collection.JavaConversions /** * Flink SQL解析器,用于解析Flink SQL语句中的库、表、分区、操作类型等信息 * * @author ChengLong 2021-6-18 16:41:04 * @since 2.0.0 */ @Internal private[fire] trait FlinkSqlParserBase extends SqlParser { // calcite parser config protected lazy val tableEnv = FlinkSingletonFactory.getTableEnv.asInstanceOf[StreamTableEnvironment] protected lazy val hiveTableMetaDataMap = new JConcurrentHashMap[String, Table]() /** * 用于解析给定的SQL语句 */ override def sqlParser(sql: String): Unit = { try { FlinkUtils.sqlParser(sql) match { case select: SqlSelect => this.parseSqlNode(select) case insert: RichSqlInsert => { this.parseSqlNode(insert.getTargetTable, Operation.INSERT_INTO) this.parsePartitions(insert.getTargetTable.asInstanceOf[SqlIdentifier], Seq(insert.getStaticPartitions)) this.parseSqlNode(insert.getSource, Operation.SELECT, targetTable = Some(insert.getTargetTable)) } case createView: SqlCreateView => { this.parseSqlNode(createView.getViewName, Operation.CREATE_VIEW) this.parseSqlNode(createView.getQuery, Operation.SELECT) } case createTable: SqlCreateTable => parseCreateTable(createTable) case _ => this.hiveSqlParser(sql) } } catch { case e: Throwable => this.hiveSqlParser(sql) } } /** * 用于解析Hive SQL */ @Internal protected def hiveSqlParser(sql: String): Unit = { FlinkUtils.sqlParser(sql, FlinkUtils.calciteHiveParserConfig) match { case sqlAddPartitions: SqlAddPartitions => { this.parseSqlNode(sqlAddPartitions.getTableName, Operation.ADD_PARTITION, true) this.parsePartitions(sqlAddPartitions.getTableName, sqlAddPartitions.getPartSpecs) } case sqlDropPartitions: SqlDropPartitions => { this.parseSqlNode(sqlDropPartitions.getTableName, Operation.DROP_PARTITION, true) this.parsePartitions(sqlDropPartitions.getTableName, sqlDropPartitions.getPartSpecs) } case sqlDropTable: SqlDropTable => this.parseSqlNode(sqlDropTable.getTableName, Operation.DROP_TABLE, true) case sqlDropDatabase: SqlDropDatabase => this.parseSqlNode(sqlDropDatabase.getDatabaseName, Operation.DROP_DATABASE) case sqlAlterTable: SqlAlterTable => this.parseSqlNode(sqlAlterTable.getTableName, Operation.ALTER_TABLE, true) case sqlCreateDatabase: SqlCreateDatabase => this.parseSqlNode(sqlCreateDatabase.getDatabaseName, Operation.CREATE_DATABASE, true) case sqlAlterTableRename: SqlAlterTableRename => this.parseSqlNode(sqlAlterTableRename.getTableName, Operation.RENAME_TABLE_OLD, true) case sqlCreateTable: SqlCreateTable => this.parseHiveCreateTable(sqlCreateTable) case _ => this.logger.info(s"可忽略异常:实时血缘解析SQL报错,SQL:\n$sql") } } /** * 解析查询SQL中的SqlNode */ @Internal protected def parseSqlNode(sqlNode: 
SqlNode, operation: Operation = Operation.SELECT, isHive: Boolean = false, targetTable: Option[SqlNode] = None): Unit = { sqlNode match { case select: SqlSelect => this.parseSqlNode(select.getFrom, operation, isHive, targetTable) case sqlJoin: SqlJoin => { this.parseSqlNode(sqlJoin.getLeft, operation, isHive, targetTable) this.parseSqlNode(sqlJoin.getRight, operation, isHive, targetTable) } case sqlBasicCall: SqlBasicCall => { sqlBasicCall.operands.foreach(sqlNode => { // 过滤掉别名 if (sqlNode.isInstanceOf[SqlIdentifier]) { val sqlIdentifier = sqlNode.asInstanceOf[SqlIdentifier] val componentPositions = ReflectionUtils.getFieldByName(sqlIdentifier.getClass, "componentPositions") if (componentPositions.get(sqlIdentifier) == null) return } if (sqlNode.isInstanceOf[SqlSnapshot]) { this.parseSqlNode(sqlNode.asInstanceOf[SqlSnapshot].getTableRef, operation, isHive, targetTable) } this.parseSqlNode(sqlNode, operation, isHive, targetTable) }) } case sqlIdentifier: SqlIdentifier => { val tableIdentifier = toFireTableIdentifier(sqlIdentifier, isHive) this.addCatalog(tableIdentifier, operation) if (targetTable.isDefined) { SQLLineageManager.addRelation(tableIdentifier, TableIdentifier(targetTable.get.toString)) } } case sqlNodeList: SqlNodeList => JavaConversions.asScalaBuffer(sqlNodeList.getList).foreach(this.parseSqlNode(_)) case sqlTableLike: SqlTableLike => this.parseSqlNode(sqlTableLike.getSourceTable, operation, isHive, targetTable) case _ => } } /** * 移除表的catalog名称 */ protected def replaceCatalogName(tableName: String): String = { tableName.replace(FireHiveConf.hiveCatalogName + ".", "").replace(FireFlinkConf.defaultCatalogName + ".", "") } /** * 将Fire的TableIdentifier转为Flink的ObjectPath对象 * * @param isHiveTable * 如果是hive表,则默认的数据库名称从配置文件中获取,否则从env中获取默认的数据库名称 */ @Internal protected def toFlinkTableIdentifier(tableIdentifier: TableIdentifier, isHiveTable: Boolean = false): ObjectPath = { val db = if (noEmpty(tableIdentifier.database)) tableIdentifier.database else if (isHiveTable) FireHiveConf.defaultDB else this.tableEnv.defaultCatalog.get().getDefaultDatabase new ObjectPath(db, tableIdentifier.table) } /** * 将Flink的ObjectPath对象转为Fire的TableIdentifier */ @Internal protected def toFireTableIdentifier(objectPath: ObjectPath): TableIdentifier = { val db = if (noEmpty(objectPath.getDatabaseName)) objectPath.getDatabaseName else this.tableEnv.defaultCatalog.get().getDefaultDatabase TableIdentifier(db, objectPath.getObjectName) } /** * 将Flink的SqlIdentifier转为Fire的TableIdentifier */ @Internal protected def toFireTableIdentifier(sqlIdentifier: SqlIdentifier, isHive: Boolean): TableIdentifier = { val tableName = this.replaceCatalogName(sqlIdentifier.toString.toLowerCase) if (isHive) this.toFireHiveTableIdentifier(TableIdentifier(tableName)) else TableIdentifier(tableName) } /** * 补全hive表所在的数据库信息 */ @Internal protected def toFireHiveTableIdentifier(tableIdentifier: TableIdentifier): TableIdentifier = { val db = if (tableIdentifier.notExistsDB) FireHiveConf.defaultDB else tableIdentifier.database TableIdentifier(tableIdentifier.table, db) } /** * 用于判断给定的表是否为临时表 */ override def isTempView(tableIdentifier: TableIdentifier): Boolean = { if (this.tableEnv.defaultCatalog.isPresent) { val catalog = this.tableEnv.defaultCatalog.get() catalog.tableExists(this.toFlinkTableIdentifier(tableIdentifier)) } else { false } } /** * 获取Hive表元数据信息 */ @Internal protected def getHiveTable(tableIdentifier: TableIdentifier): Option[Table] = { if (!this.tableEnv.hiveCatalog.isPresent) return None // 获取hive表所在的数据库名称 val 
hiveTableIdentifier = if (tableIdentifier.notExistsDB) TableIdentifier(tableIdentifier.table, FireHiveConf.defaultDB) else tableIdentifier val hiveTable = this.hiveTableMetaDataMap.mergeGet(hiveTableIdentifier.identifier) { this.tableEnv.hiveCatalog.get().asInstanceOf[HiveCatalog].getHiveTable(this.toFlinkTableIdentifier(hiveTableIdentifier, true)) } Some(hiveTable) } /** * 用于判断给定的表是否为hive表 */ @Internal override def isHiveTable(tableIdentifier: TableIdentifier): Boolean = { this.hiveTableMap.mergeGet(tableIdentifier.identifier) { tryWithReturn { if (this.tableEnv.hiveCatalog.isPresent) { val hiveCatalog = this.tableEnv.hiveCatalog.get().asInstanceOf[HiveCatalog] if (tableIdentifier.notExistsDB) { hiveCatalog.tableExists(this.toFlinkTableIdentifier(TableIdentifier(tableIdentifier.identifier, FireHiveConf.defaultDB))) } else { hiveCatalog.tableExists(this.toFlinkTableIdentifier(tableIdentifier)) } } else false } (this.logger, catchLog = s"判断${tableIdentifier}是否为hive表失败", hook = false) } } /** * 用于判断给定的表是否为hive表 * * @param tableIdentifier 库表 */ @Internal protected def getCatalog(tableIdentifier: TableIdentifier): Datasource = { val isHive = this.isHiveTable(tableIdentifier) if (isHive) Datasource.HIVE else Datasource.VIEW } /** * 将解析到的表信息添加到实时血缘中 */ @Internal protected def addCatalog(identifier: TableIdentifier, operation: Operation): Unit = { SQLLineageManager.setOperation(identifier, operation.toString) // Flink临时表血缘解析 if (this.isTempView(identifier)) { SQLLineageManager.setCatalog(identifier, this.getCatalog(identifier).toString) SQLLineageManager.setTmpView(identifier, identifier.toString()) } // Hive表血缘解析 if (this.isHiveTable(identifier)) { val hiveTable = this.getHiveTable(identifier) if (hiveTable.isDefined) { val hive = hiveTable.get // 获取hive表额外信息 val tableIdentifier = TableIdentifier(identifier.toString, hive.getDbName) SQLLineageManager.setPhysicalTable(tableIdentifier, tableIdentifier.toString) SQLLineageManager.setTmpView(tableIdentifier, tableIdentifier.toString) SQLLineageManager.setCatalog(tableIdentifier, this.getCatalog(identifier).toString) if (hive.getSd != null) { // 获取表存储路径 SQLLineageManager.setCluster(tableIdentifier, hive.getSd.getLocation) // 获取字段列表 if (hive.getSd.getCols.nonEmpty) { val fields = hive.getSd.getCols.map(schema => (schema.getName, schema.getType)) SQLLineageManager.setColumns(tableIdentifier, fields) } } // 获取分区列表 if (hive.getPartitionKeys.nonEmpty) { val partitions = hive.getPartitionKeys.map(schema => (schema.getName, schema.getType)) SQLLineageManager.setPartitions(tableIdentifier, partitions) } } } } /** * SQL语法校验 * * @param sql * sql statement * @return * true:校验成功 false:校验失败 */ override def sqlLegal(sql: JString): Boolean = FlinkUtils.sqlLegal(sql) /** * 解析SQL中的分区信息 */ @Internal protected def parsePartitions(sqlIdentifier: SqlIdentifier, partitionsNode: Seq[SqlNodeList]): Unit = { val tableIdentifier = this.toFireTableIdentifier(sqlIdentifier, true) val partitions = partitionsNode.flatMap(sqlNodeList => sqlNodeList.getList.map(sqlNode => sqlNode.asInstanceOf[SqlProperty])).map(partitionNode => partitionNode.getKeyString -> partitionNode.getValueString).toMap if (partitions.nonEmpty) { SQLLineageManager.setPartitions(tableIdentifier, partitions.toSeq) } } /** * 解析flink create table语句 */ @Internal protected def parseCreateTable(createTable: SqlCreateTable): Unit = { // create table语句 val tableIdentifier = this.toFireTableIdentifier(createTable.getTableName, false) SQLLineageManager.setTmpView(tableIdentifier, tableIdentifier.identifier) 
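    /*
     * Illustrative example (editor's sketch, not part of the original source): a typical
     * CREATE TABLE statement handled by this method. For the kafka connector, the with-options
     * shown below are the ones read by parseKafkaConnector further down in this file; the
     * table name and option values are hypothetical.
     *
     * {{{
     *   CREATE TABLE t_order (
     *     id BIGINT,
     *     order_no STRING
     *   ) WITH (
     *     'connector' = 'kafka',
     *     'topic' = 'order_topic',
     *     'properties.bootstrap.servers' = 'localhost:9092',
     *     'properties.group.id' = 'fire_demo'
     *   )
     * }}}
     */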
this.parseSqlNode(createTable.getTableName, Operation.CREATE_TABLE) val tableLike = createTable.getTableLike if (!tableLike.isPresent) { // 解析建表语句中的with参数列表 val properties = this.parseOptions(tableIdentifier, createTable.getPropertyList) // 解析建表语句中的字段列表 this.parseColumns(tableIdentifier, createTable.getColumnList) val catalog = properties.getOrElse("connector", "") if (noEmpty(catalog)) { SQLLineageManager.setCatalog(tableIdentifier, catalog) catalog match { case "kafka" => this.parseKafkaConnector(tableIdentifier, properties) case "jdbc" => this.parseJDBCConnector(tableIdentifier, properties) case "fire-rocketmq" => this.parseRocketMQConnector(tableIdentifier, properties) case _ => } } } else { // create table like语句 this.parseSqlNode(tableLike.get(), Operation.SELECT) } } /** * 解析JDBC数据源 */ @Internal protected def parseJDBCConnector(tableIdentifier: TableIdentifier, properties: Map[JString, JString]): Unit = { val tableName = properties.getOrElse("table-name", "") SQLLineageManager.setPhysicalTable(tableIdentifier, tableName) val url = properties.getOrElse("url", "") SQLLineageManager.setCluster(tableIdentifier, FireJdbcConf.jdbcUrl(url)) val username = properties.getOrElse("username", "") LineageManager.addDBSql(Datasource.JDBC.toString, url, username, "", Operation.CREATE_TABLE, Operation.SELECT) } /** * 解析RocketMQ数据源 */ @Internal protected def parseRocketMQConnector(tableIdentifier: TableIdentifier, properties: Map[JString, JString]): Unit = { val url = properties.getOrElse("rocket.brokers.name", "") SQLLineageManager.setCluster(tableIdentifier, FireRocketMQConf.rocketNameServer(url)) val topic = properties.getOrElse("rocket.topics", "") SQLLineageManager.setPhysicalTable(tableIdentifier, topic) val groupId = properties.getOrElse("rocket.group.id", "") LineageManager.addMQDatasource(Datasource.ROCKETMQ.toString, url, topic, groupId, Operation.CREATE_TABLE, Operation.SOURCE) } /** * 解析kafka数据源 */ @Internal protected def parseKafkaConnector(tableIdentifier: TableIdentifier, properties: Map[JString, JString]): Unit = { val url = properties.getOrElse("properties.bootstrap.servers", "") SQLLineageManager.setCluster(tableIdentifier, FireKafkaConf.kafkaBrokers(url)) val topic = properties.getOrElse("topic", "") SQLLineageManager.setPhysicalTable(tableIdentifier, topic) val groupId = properties.getOrElse("properties.group.id", "") LineageManager.addMQDatasource(Datasource.KAFKA.toString, url, topic, groupId, Operation.CREATE_TABLE, Operation.SOURCE) } /** * 解析hive建表语句 */ @Internal protected def parseHiveCreateTable(sqlCreateTable: SqlCreateTable): Unit = { // 解析表名 val tableIdentifier = toFireHiveTableIdentifier(TableIdentifier(sqlCreateTable.getTableName.toString)) this.addCatalog(tableIdentifier, Operation.CREATE_TABLE) // 解析表注释 if (sqlCreateTable.getComment.isPresent) SQLLineageManager.setComment(tableIdentifier, sqlCreateTable.getComment.get().toString) // 解析使用到的字段列表 this.parseColumns(tableIdentifier, sqlCreateTable.getColumnList) // 解析options信息 this.parseOptions(tableIdentifier, sqlCreateTable.getPropertyList) this.parseSqlNode(sqlCreateTable.getTableName, Operation.CREATE_TABLE, true) } /** * 用于解析sql中的options * @param tableIdentifier * 表名 * @param options * 选项信息 */ @Internal protected def parseOptions(tableIdentifier: TableIdentifier, options: SqlNodeList): Map[String, String] = { val props = options.getList.map(t => t.toString.replace("'", "").split("=")) .filter(t => t.nonEmpty && t.length == 2).map(t => if (t.contains("password")) (t(0).trim, "******") else (t(0).trim, 
t(1).trim)).toMap SQLLineageManager.setOptions(tableIdentifier, props) props } /** * 解析字段列表信息 * @param tableIdentifier * 表名 * @param columnList * 字段列表 */ @Internal protected def parseColumns(tableIdentifier: TableIdentifier, columnList: SqlNodeList): Unit = { val columns = columnList.toList.map(sqlNode => { sqlNode.toString.replace("`", "").split(" ") }).filter(arr => arr.nonEmpty && arr.length == 2).map(t => (t(0), t(1))) SQLLineageManager.setColumns(tableIdentifier, columns) } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/sync/DistributeSyncManager.scala ================================================ package com.zto.fire.flink.sync import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.enu.ThreadPoolType import com.zto.fire.common.util.{JSONUtils, LineageManager, PropUtils, ThreadUtils} import com.zto.fire.core.bean.ArthasParam import com.zto.fire.core.plugin.ArthasDynamicLauncher import com.zto.fire.core.rest.SystemRestful import com.zto.fire.core.sync.SyncManager import com.zto.fire.flink.bean.DistributeBean import com.zto.fire.flink.conf.FireFlinkConf import com.zto.fire.flink.enu.DistributeModule import com.zto.fire.predef._ import java.util.concurrent.{ScheduledExecutorService, TimeUnit} import java.util.concurrent.atomic.AtomicInteger /** * Flink分布式数据同步管理器,用于将数据从JobManager端同步至每一个TaskManager端 * * @author ChengLong 2021-11-9 13:21:39 * @since 2.2.0 */ private[fire] object DistributeSyncManager extends SyncManager { private var lastJsonConf = "" private lazy val distributeSyncUrl = "/system/distributeSync" private lazy val lineageUrl = "/system/collectLineage" // 用于记录血缘解析运行的次数 private lazy val lineageRunCount = new AtomicInteger() private lazy val lineageThread = ThreadUtils.createThreadPool("LineageSyncThread", ThreadPoolType.SCHEDULED).asInstanceOf[ScheduledExecutorService] /** * 准实时同步最新配置信息 */ def sync: Unit = { ThreadUtils.scheduleWithFixedDelay({ if (!FireFlinkConf.distributeSyncEnabled) return val jsonConf = SystemRestful.restInvoke(this.distributeSyncUrl) if (!this.lastJsonConf.equals(jsonConf)) { if (JSONUtils.isJson(jsonConf)) { val distribute = JSONUtils.parseObject[DistributeBean](jsonConf) distribute.getModule match { case DistributeModule.CONF => this.syncConf(distribute.getJson) case DistributeModule.ARTHAS => ArthasDynamicLauncher.command(JSONUtils.parseObject[ArthasParam](distribute.getJson)) } } this.lastJsonConf = jsonConf } }, 60, 30, TimeUnit.SECONDS) } /** * 同步引擎各个container的信息到累加器中 */ def collect: Unit = { lineageThread.scheduleWithFixedDelay(new Runnable { override def run(): Unit = { val lineageMap = LineageManager.getDatasourceLineage if (noEmpty(lineageMap)) { val json = JSONUtils.toJSONString(lineageMap) SystemRestful.restInvoke(lineageUrl, json) } if (lineageRunCount.incrementAndGet() > FireFrameworkConf.lineageRunCount) { logger.info(s"Flink分布式血缘解析与采集任务即将退出,总计运行:${lineageRunCount.get()}次") lineageThread.shutdown() } logger.info(s"完成Flink分布式血缘解析与采集:${lineageRunCount.get()}次") } }, FireFrameworkConf.lineageRunInitialDelay, FireFrameworkConf.lineageRunPeriod, TimeUnit.SECONDS) } /** * 更新配置信息 */ def syncConf(json: String): Unit = { if (noEmpty(json)) { val confMap = JSONUtils.parseObject[JMap[String, String]](json) PropUtils.setProperties(confMap) this.logger.info(s"本次分布式更新配置数:${confMap.size()}个") } } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/sync/FlinkLineageAccumulatorManager.scala 
================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.sync import com.zto.fire._ import com.zto.fire.common.bean.lineage.Lineage import com.zto.fire.common.conf.{FireKafkaConf, FireRocketMQConf} import com.zto.fire.common.enu.Datasource import com.zto.fire.common.util._ import com.zto.fire.core.sync.LineageAccumulatorManager import com.zto.fire.hbase.conf.FireHBaseConf import com.zto.fire.jdbc.conf.FireJdbcConf import com.zto.fire.predef.{JConcurrentHashMap, JHashSet} import java.lang.{Boolean => JBoolean} import java.util.concurrent.atomic.AtomicLong /** * 用于将各个TaskManager端数据收集到JobManager端 * * @author ChengLong 2022-08-29 16:29:17 * @since 2.3.2 */ object FlinkLineageAccumulatorManager extends LineageAccumulatorManager { private lazy val lineageMap = new JConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]() private lazy val counter = new AtomicLong() /** * 去重合并血缘信息 */ private def mergeLineage(lineage: JConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]): Unit = { // 合并来自各个TaskManager端的血缘信息 if (lineage.nonEmpty) merge(lineage) // 合并血缘管理器中的血缘信息 if (LineageManager.getDatasourceLineage.nonEmpty) merge(LineageManager.getDatasourceLineage) /** * 合并血缘 */ def merge(lineage: JConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]): Unit = { lineage.foreach(each => { var set = this.lineageMap.get(each._1) if (set == null) set = new JHashSet[DatasourceDesc]() // 兼容jackson反序列化不支持HashSet与case class的问题 if (each._2.isInstanceOf[JArrayList[_]]) { val datasource = each._2.asInstanceOf[JArrayList[JMap[String, _]]] datasource.foreach(map => { if (map.containsKey("datasource")) { val datasource = map.getOrElse("datasource", "").toString.toUpperCase val cluster = map.getOrElse("cluster", "").toString val username = map.getOrElse("username", "").toString val tableName = map.getOrElse("tableName", "").toString val topics = map.getOrElse("topics", "").toString val groupId = map.getOrElse("groupId", "").toString val operationArr = map.getOrElse("operation", "[]").toString.replace("[", "").replace("]", "") val operation = operationArr.split(",").map(operation => com.zto.fire.common.enu.Operation.parse(operation)).toSet Datasource.parse(datasource) match { case Datasource.JDBC => set.add(DBDatasource(datasource, FireJdbcConf.jdbcUrl(cluster), tableName, username, operation = operation)) case Datasource.HBASE => set.add(DBDatasource(datasource, FireHBaseConf.hbaseClusterUrl(cluster), tableName, username, operation = operation)) case Datasource.KAFKA => set.add(MQDatasource(datasource, FireKafkaConf.kafkaBrokers(cluster), topics, groupId, operation = operation)) case Datasource.ROCKETMQ => set.add(MQDatasource(datasource, FireRocketMQConf.rocketNameServer(cluster), topics, groupId, operation = operation)) case _ => } } 
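              /*
               * Editor's note (illustrative, not part of the original source): after Jackson
               * deserialization each element arrives as a plain Map rather than a typed
               * DatasourceDesc, roughly of the shape
               *   {"datasource":"KAFKA", "cluster":"localhost:9092", "topics":"order_topic",
               *    "groupId":"fire_demo", "operation":"[SOURCE]"}
               * (all values hypothetical). The branches above rebuild typed DBDatasource /
               * MQDatasource instances from those fields; entries with an unrecognized
               * datasource type are dropped.
               */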
}) } else { each._2.filter(desc => desc.toString.contains("datasource")).foreach(desc => set.add(desc)) } if (set.nonEmpty) this.lineageMap.put(each._1, set) }) } } /** * 将血缘信息放到累加器中 */ override def add(lineage: JConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]): Unit = { if (lineage.nonEmpty) this.mergeLineage(lineage) } /** * 累加Long类型数据 */ override def add(value: Long): Unit = this.counter.addAndGet(value) /** * 获取收集到的血缘消息 */ override def getValue: Lineage = { new Lineage(this.lineageMap , SQLLineageManager.getSQLLineage) } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/sync/SyncFlinkEngine.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.sync import com.zto.fire.common.bean.lineage.Lineage import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.enu.Datasource import com.zto.fire.common.util.{DatasourceDesc, ReflectionUtils} import com.zto.fire.core.sync.SyncEngineConf import com.zto.fire.flink.util.FlinkUtils import com.zto.fire.predef._ /** * 获取Spark引擎的所有配置信息 * * @author ChengLong * @since 2.0.0 * @create 2021-03-02 11:12 */ private[fire] class SyncFlinkEngine extends SyncEngineConf { private lazy val globalConfiguration = "org.apache.flink.configuration.GlobalConfiguration" private lazy val environmentInformation = "org.apache.flink.runtime.util.EnvironmentInformation" private lazy val getSettings = "getSettings" /** * 获取Flink引擎的所有配置信息 */ override def syncEngineConf: Map[String, String] = { if (FlinkUtils.isJobManager) { // 如果是JobManager端,则需将flink参数和用户参数进行合并,并从合并后的settings中获取 val clazz = Class.forName(this.globalConfiguration) if (ReflectionUtils.containsMethod(clazz, this.getSettings)) { return clazz.getMethod(this.getSettings).invoke(null).asInstanceOf[JMap[String, String]].toMap } } else if (FlinkUtils.isTaskManager) { // 启动分布式血缘采集 this.collect // 启用分布式同步 DistributeSyncManager.sync // 如果是TaskManager端,则flink会通过EnvironmentInformation将参数进行传递 val clazz = Class.forName(this.environmentInformation) if (ReflectionUtils.containsMethod(clazz, this.getSettings)) { return clazz.getMethod(this.getSettings).invoke(null).asInstanceOf[JMap[String, String]].toMap } } new JHashMap[String, String]().toMap } /** * 在master端获取系统累加器中的数据 */ override def syncLineage: Lineage = { FlinkLineageAccumulatorManager.getValue } /** * 同步引擎各个container的信息到累加器中 */ override def collect: Unit = { if (!FireFrameworkConf.lineageEnable || !this.isCollect.compareAndSet(false, true)) return DistributeSyncManager.collect } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/task/FlinkInternalTask.scala ================================================ /* * Licensed to the 
Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.task import com.zto.fire.common.anno.Scheduled import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.util.{JSONUtils, MQProducer} import com.zto.fire.core.task.FireInternalTask import com.zto.fire.flink.BaseFlink import com.zto.fire.flink.sync.FlinkLineageAccumulatorManager /** * 定时任务调度器,用于定时执行Flink框架内部指定的任务 * * @author ChengLong * @since 1.0.0 * @create 2020-07-14 11:04 */ private[fire] class FlinkInternalTask(baseFlink: BaseFlink) extends FireInternalTask(baseFlink) { /** * 实时血缘发送定时任务,定时将血缘信息发送到kafka中 */ @Scheduled(fixedInterval = 60000, initialDelay = 10000, repeatCount = 360) override def lineage: Unit = { sendLineage this.registerLineageHook(sendLineage) def sendLineage: Unit = { if (FireFrameworkConf.lineageEnable && FireFrameworkConf.lineageSendMqEnable) { MQProducer.sendKafka(FireFrameworkConf.lineageMQUrl, FireFrameworkConf.lineageTopic, JSONUtils.toJSONString(FlinkLineageAccumulatorManager.getValue)) } } } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/util/FlinkSingletonFactory.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.util import com.zto.fire.core.util.SingletonFactory import org.apache.flink.api.scala.ExecutionEnvironment import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment import org.apache.flink.table.api.TableEnvironment /** * 单例工厂,用于创建单例的对象 * Created by ChengLong on 2020年1月6日 16:50:56. 
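 *
 * A hypothetical usage sketch (editor's addition, inferred from the setters and getters defined
 * below; not original documentation):
 * {{{
 *   FlinkSingletonFactory.setStreamEnv(streamEnv).setTableEnv(tableEnv)
 *   val tableEnv = FlinkSingletonFactory.getTableEnv
 * }}}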
 */
object FlinkSingletonFactory extends SingletonFactory {
  @transient private[this] var streamEnv: StreamExecutionEnvironment = _
  @transient private[this] var env: ExecutionEnvironment = _
  @transient private[this] var tableEnv: TableEnvironment = _

  /**
   * Set the StreamExecutionEnvironment instance
   */
  private[fire] def setStreamEnv(env: StreamExecutionEnvironment): this.type = {
    if (env != null && this.streamEnv == null) this.streamEnv = env
    this
  }

  /**
   * Set the ExecutionEnvironment instance
   */
  private[fire] def setEnv(env: ExecutionEnvironment): this.type = {
    if (env != null && this.env == null) this.env = env
    this
  }

  /**
   * Set the TableEnvironment instance
   */
  private[fire] def setTableEnv(tableEnv: TableEnvironment): this.type = {
    if (tableEnv != null && this.tableEnv == null) this.tableEnv = tableEnv
    this
  }

  /**
   * Get the application name
   *
   * @return
   * the appName
   */
  private[fire] def getAppName: String = this.appName

  /**
   * Get the TableEnvironment instance
   *
   * @return
   * the TableEnvironment instance
   */
  private[fire] def getTableEnv: TableEnvironment = this.tableEnv
}


================================================
FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/util/FlinkUtils.scala
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package com.zto.fire.flink.util import com.google.common.collect.HashBasedTable import com.zto.fire.{JHashMap, JStringBuilder, noEmpty} import com.zto.fire.common.anno.{FieldName, Internal} import com.zto.fire.common.util._ import com.zto.fire.flink.bean.FlinkTableSchema import com.zto.fire.flink.conf.FireFlinkConf import com.zto.fire.flink.sql.FlinkSqlParser import com.zto.fire.hbase.bean.HBaseBaseBean import com.zto.fire.predef._ import org.apache.calcite.avatica.util.{Casing, Quoting} import org.apache.commons.lang3.StringUtils import org.apache.calcite.avatica.util.{Casing, Quoting} import org.apache.calcite.sql.{SqlNodeList, _} import org.apache.flink.table.api.{SqlDialect => FlinkSqlDialect} import org.apache.calcite.sql.parser.{SqlParser => CalciteParser} import org.apache.flink.api.common.ExecutionConfig.ClosureCleanerLevel import org.apache.flink.api.common.{ExecutionConfig, ExecutionMode, InputDependencyConstraint} import org.apache.flink.runtime.util.EnvironmentInformation import org.apache.flink.sql.parser.hive.impl.FlinkHiveSqlParserImpl import org.apache.flink.sql.parser.impl.FlinkSqlParserImpl import org.apache.flink.table.data.binary.BinaryStringData import org.apache.flink.table.data.{DecimalData, GenericRowData, RowData} import org.apache.flink.table.types.logical.RowType import org.apache.flink.types.Row import java.net.{URL, URLClassLoader} import scala.util.Try /** * flink相关工具类 * * @author ChengLong 2020年1月16日 16:28:23 * @since 0.4.1 */ object FlinkUtils extends Serializable with Logging { // 维护schema、fieldName与fieldIndex关系 private[this] val schemaTable = HashBasedTable.create[FlinkTableSchema, String, Int] private var jobManager: Option[Boolean] = None private var mode: Option[String] = None lazy val calciteParserConfig = this.createParserConfig lazy val calciteHiveParserConfig = this.createHiveParserConfig /** * 构建flink default的SqlParser config */ def createParserConfig(dialect: FlinkSqlDialect = FlinkSqlDialect.DEFAULT): CalciteParser.Config = { val configBuilder = CalciteParser.configBuilder .setQuoting(Quoting.BACK_TICK) .setUnquotedCasing(Casing.TO_UPPER) .setQuotedCasing(Casing.UNCHANGED) if (dialect == FlinkSqlDialect.DEFAULT) configBuilder.setParserFactory(FlinkSqlParserImpl.FACTORY) else configBuilder.setParserFactory(FlinkHiveSqlParserImpl.FACTORY) configBuilder.build } /** * 构建flink default的SqlParser config */ private[this] def createParserConfig: CalciteParser.Config = this.createParserConfig() /** * 构建flink hive方言版的SqlParser config */ private[this] def createHiveParserConfig: CalciteParser.Config = this.createParserConfig(FlinkSqlDialect.HIVE) /** * 根据sql构建Calcite SqlParser */ def sqlParser(sql: String, config: CalciteParser.Config = this.createParserConfig): SqlNode = { CalciteParser.create(sql, config).parseStmt() } /** * SQL语法校验,如果语法错误,则返回错误堆栈 * @param sql * sql statement */ def sqlValidate(sql: String): Try[Unit] = { val retVal = Try { try { // 使用默认的sql解析器解析 val sqlNode = this.sqlParser(sql) } catch { case e: Throwable => { // 使用hive方言语法解析器解析 val sqlNode = this.sqlParser(sql, this.calciteHiveParserConfig) } } } if (retVal.isFailure) { ExceptionBus.post(retVal.failed.get, sql) } retVal } /** * SQL语法校验 * @param sql * sql statement * @return * true:校验成功 false:校验失败 */ def sqlLegal(sql: String): Boolean = this.sqlValidate(sql).isSuccess /** * 将schema、fieldName与fieldIndex信息维护到table中 */ private[this] def extendSchemaTable(schema: FlinkTableSchema): Unit = { if (schema != null && !schemaTable.containsRow(schema)) { for (i <- 0 until schema.getFieldCount) { 
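        // cache the fieldName -> fieldIndex mapping for this schema so that rowToBean (below)
        // can look up a JavaBean field's position in the Row by name without rescanning the schema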
schemaTable.put(schema, schema.getFieldName(i).get(), i) } } } /** * 将Row转为自定义bean,以JavaBean中的Field为基准 * bean中的field名称要与DataFrame中的field名称保持一致 * * @return */ def rowToBean[T](schema: FlinkTableSchema, row: Row, clazz: Class[T]): T = { requireNonEmpty(schema, row, clazz) val obj = clazz.newInstance() tryWithLog { this.extendSchemaTable(schema) clazz.getDeclaredFields.foreach(field => { ReflectionUtils.setAccessible(field) val anno = field.getAnnotation(classOf[FieldName]) val begin = if (anno == null) true else !anno.disuse() if (begin) { val fieldName = if (anno != null && ValueUtils.noEmpty(anno.value())) anno.value().trim else field.getName if (this.schemaTable.contains(schema, fieldName)) { val fieldIndex = this.schemaTable.get(schema, fieldName) field.set(obj, row.getField(fieldIndex)) } } }) if (obj.isInstanceOf[HBaseBaseBean[T]]) { val method = ReflectionUtils.getMethodByName(clazz, "buildRowKey") if (method != null) method.invoke(obj) } }(this.logger, catchLog = "flink row转为JavaBean过程中发生异常.") obj } /** * 解析并设置配置文件中的配置信息 */ def parseConf(config: ExecutionConfig): ExecutionConfig = { requireNonEmpty(config)("Flink配置实例不能为空") // flink.auto.generate.uid.enable=true 默认为:true if (FireFlinkConf.autoGenerateUidEnable) { config.enableAutoGeneratedUIDs() } else { config.disableAutoGeneratedUIDs() } // flink.auto.type.registration.enable=true 默认为:true if (!FireFlinkConf.autoTypeRegistrationEnable) { config.disableAutoTypeRegistration() } // flink.force.avro.enable=true 默认值为:false if (FireFlinkConf.forceAvroEnable) { config.enableForceAvro() } else { config.disableForceAvro() } // flink.force.kryo.enable=true 默认值为:false if (FireFlinkConf.forceKryoEnable) { config.enableForceKryo() } else { config.disableForceKryo() } // flink.generic.types.enable=true 默认值为:false if (FireFlinkConf.genericTypesEnable) { config.enableGenericTypes() } else { config.disableGenericTypes() } // flink.object.reuse.enable=true 默认值为:false if (FireFlinkConf.objectReuseEnable) { config.enableObjectReuse() } else { config.disableObjectReuse() } // flink.auto.watermark.interval=0 默认值为:0 if (FireFlinkConf.autoWatermarkInterval != -1) config.setAutoWatermarkInterval(FireFlinkConf.autoWatermarkInterval) // flink.closure.cleaner.level=recursive 默认值为:RECURSIVE,包括:RECURSIVE、NONE、TOP_LEVEL if (StringUtils.isNotBlank(FireFlinkConf.closureCleanerLevel)) config.setClosureCleanerLevel(ClosureCleanerLevel.valueOf(FireFlinkConf.closureCleanerLevel.toUpperCase)) // flink.default.input.dependency.constraint=any 默认值:ANY,包括:ANY、ALL if (StringUtils.isNotBlank(FireFlinkConf.defaultInputDependencyConstraint)) config.setDefaultInputDependencyConstraint(InputDependencyConstraint.valueOf(FireFlinkConf.defaultInputDependencyConstraint.toUpperCase)) // flink.execution.mode=pipelined 默认值:PIPELINED,包括:PIPELINED、PIPELINED_FORCED、BATCH、BATCH_FORCED if (StringUtils.isNotBlank(FireFlinkConf.executionMode)) config.setExecutionMode(ExecutionMode.valueOf(FireFlinkConf.executionMode.toUpperCase)) // flink.latency.tracking.interval=0 默认值:0 if (FireFlinkConf.latencyTrackingInterval != -1) config.setLatencyTrackingInterval(FireFlinkConf.latencyTrackingInterval) // flink.max.parallelism=1 没有默认值 if (FireFlinkConf.maxParallelism != -1) config.setMaxParallelism(FireFlinkConf.maxParallelism) // flink.task.cancellation.interval=1 无默认值 if (FireFlinkConf.taskCancellationInterval != -1) config.setTaskCancellationInterval(FireFlinkConf.taskCancellationInterval) // flink.task.cancellation.timeout.millis=1000 无默认值 if (FireFlinkConf.taskCancellationTimeoutMillis != -1) 
config.setTaskCancellationTimeout(FireFlinkConf.taskCancellationTimeoutMillis) // flink.use.snapshot.compression=false 默认值:false config.setUseSnapshotCompression(FireFlinkConf.useSnapshotCompression) config } /** * 加载指定路径下的udf jar包 */ def loadUdfJar: Unit = { val udfJarUrl = PropUtils.getString(FireFlinkConf.FLINK_SQL_CONF_UDF_JARS, "") if (StringUtils.isBlank(udfJarUrl)) { logger.warn(udfJarUrl, s"flink udf jar包路径不能为空,请在配置文件中通过:${FireFlinkConf.FLINK_SQL_CONF_UDF_JARS}=/path/to/udf.jar 指定") return } val method = classOf[URLClassLoader].getDeclaredMethod("addURL", classOf[URL]) method.setAccessible(true) val classLoader = ClassLoader.getSystemClassLoader.asInstanceOf[URLClassLoader] method.invoke(classLoader, new URL(udfJarUrl)) } /** * 判断当前环境是否为JobManager */ def isJobManager: Boolean = { if (this.jobManager.isEmpty) { val envClass = Class.forName("org.apache.flink.runtime.util.EnvironmentInformation") if (ReflectionUtils.containsMethod(envClass, "isJobManager")) { val method = envClass.getMethod("isJobManager") jobManager = Some((method.invoke(null) + "").toBoolean) } else { logger.error("未找到方法:EnvironmentInformation.isJobManager()") } } jobManager.getOrElse(true) } /** * 判断当前环境是否为TaskManager */ def isTaskManager: Boolean = !this.isJobManager /** * 获取flink的运行模式 */ def deployMode: String = { if (this.mode.isEmpty) { val globalConfClass = Class.forName("org.apache.flink.configuration.GlobalConfiguration") if (ReflectionUtils.containsMethod(globalConfClass, "getRunMode")) { val method = globalConfClass.getMethod("getRunMode") this.mode = Some(method.invoke(null) + "") } else { logger.error("未找到方法:GlobalConfiguration.getRunMode()") } } val deployMode = this.mode.getOrElse("yarn-per-job") if (isEmpty(deployMode) || "null".equalsIgnoreCase(deployMode)) "local" else deployMode } /** * 判断当前运行模式是否为yarn-application模式 */ def isYarnApplicationMode: Boolean = "yarn-application".equalsIgnoreCase(this.deployMode) /** * 判断当前运行模式是否为yarn-per-job模式 */ def isYarnPerJobMode: Boolean = "yarn-per-job".equalsIgnoreCase(this.deployMode) /** * 将Javabean中匹配的field值转为RowData * * @param bean * 任意符合JavaBean规范的实体对象 * @return * RowData实例 */ def bean2RowData(bean: Object, rowType: RowType): RowData = { requireNonEmpty(bean, rowType) val genericRowData = new GenericRowData(rowType.getFieldCount) val fieldNames = rowType.getFieldNames val clazz = bean.getClass // 以建表语句中声明的字段列表为标准进行循环 for (pos <- 0 until rowType.getFieldCount) { // 根据临时表的字段名称获取JavaBean中对应的同名的field的值 val field = ReflectionUtils.getFieldByName(clazz, fieldNames.get(pos)) requireNonEmpty(field, s"JavaBean中未找到名为${fieldNames.get(pos)}的field,请检查sql建表语句或JavaBean的声明!") val value = field.get(bean).toString // 进行类型匹配,将获取到的JavaBean中的字段值映射为SQL建表语句中所指定的类型,并设置到对应的field中 rowType.getTypeAt(pos).toString match { case "INT" | "TINYINT" | "SMALLINT" | "INTEGER" => genericRowData.setField(pos, value.toInt) case "BIGINT" => genericRowData.setField(pos, value.toLong) case "DOUBLE" => genericRowData.setField(pos, value.toDouble) case "FLOAT" => genericRowData.setField(pos, value.toFloat) case "BOOLEAN" => genericRowData.setField(pos, value.toBoolean) case "BYTE" => genericRowData.setField(pos, value.toByte) case "SHORT" => genericRowData.setField(pos, value.toShort) case fieldType if fieldType.contains("DECIMAL") => { // 获取SQL建表语句中的DECIMAL字段的精度 val accuracy = rowType.getTypeAt(pos).toString.replace("DECIMAL(", "").replace(")", "").split(",") genericRowData.setField(pos, DecimalData.fromBigDecimal(new JBigDecimal(value), accuracy(0).trim.toInt, accuracy(1).trim.toInt)) } case _ => 
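          // any other declared SQL type falls back to a string value (BinaryStringData)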
genericRowData.setField(pos, new BinaryStringData(value)) } } genericRowData } /** * 将RowData中匹配的field值转为Javabean * * @param clazz * 任意符合JavaBean规范的Class类型 * @return * JavaBean实例 */ def rowData2Bean[T](clazz: Class[T], rowType: RowType, rowData: RowData): T = { requireNonEmpty(clazz, rowData) val bean = clazz.newInstance() val fieldNames = rowType.getFieldNames // 以建表语句中声明的字段列表为标准进行循环 for (pos <- 0 until rowType.getFieldCount) { // 根据临时表的字段名称获取JavaBean中对应的同名的field的值 val field = ReflectionUtils.getFieldByName(clazz, fieldNames.get(pos)) requireNonEmpty(field, s"JavaBean中未找到名为${fieldNames.get(pos)}的field,请检查sql建表语句或JavaBean的声明!") // 进行类型匹配,将获取到的JavaBean中的字段值映射为SQL建表语句中所指定的类型,并设置到对应的field中 rowType.getTypeAt(pos).toString match { case "INT" | "TINYINT" | "SMALLINT" | "INTEGER" => field.setInt(bean, rowData.getInt(pos)) case "BIGINT" => field.setLong(bean, rowData.getLong(pos)) case "DOUBLE" => field.setDouble(bean, rowData.getDouble(pos)) case "FLOAT" => field.setFloat(bean, rowData.getFloat(pos)) case "BOOLEAN" => field.setBoolean(bean, rowData.getBoolean(pos)) case "BYTE" => field.setByte(bean, rowData.getByte(pos)) case "SHORT" => field.setShort(bean, rowData.getShort(pos)) case fieldType if fieldType.contains("DECIMAL") => { // 获取SQL建表语句中的DECIMAL字段的精度 val accuracy = rowType.getTypeAt(pos).toString.replace("DECIMAL(", "").replace(")", "").split(",") field.set(bean, rowData.getDecimal(pos, accuracy(0).trim.toInt, accuracy(1).trim.toInt)) } case _ => field.set(bean, rowData.getString(pos).toString) } } bean } /** * 获取JobManager或TaskManager的标识 * @return * JobManager/container_xxx_xxx_xx_xxxx */ def getResourceId: String = { if (isJobManager) "JobManager" else PropUtils.getString("taskmanager.resource-id", OSUtils.getHostName) } /** * 获取applicationId */ def getApplicationId: String = PropUtils.getString("high-availability.cluster-id") /** * 获取flink版本号 */ def getVersion: String = EnvironmentInformation.getVersion /** * 替换sql中with表达式的value部分 * 如果配置中有与sql中相同的信息,则会被替换 * * @param originalSql * 原始含有敏感信息的SQL语句 * @return * 替换敏感信息后的SQL语句 */ def sqlWithConfReplace(originalSql: String): String = { if (!FireFlinkConf.sqlWithReplaceModeEnable) return originalSql var replacedSql = originalSql val repMap = new JHashMap[String, String]() // 正则匹配with表达式中的value部分 RegularUtils.withValueReg.findAllMatchIn(replacedSql).foreach(matchStr => { val withValue = matchStr.matched if (noEmpty(withValue)) { val matchValue = RegularUtils.valueReg.findFirstIn(withValue) if (matchValue.isDefined) { val oldValue = matchValue.get // 判断sql中的值与配置信息是否有匹配,存在匹配项则放入到map中等待下一步批量替换 val confValue = PropUtils.getString(matchValue.get.replace("'", ""), "") if (noEmpty(confValue)) { val replacedValue = if (noEmpty(confValue)) confValue else oldValue repMap.put(oldValue, s"'${replacedValue}'") } } } }) // 将存在配置的值进行替换 repMap.foreach(kv => { replacedSql = replacedSql.replace(kv._1, kv._2) }) replacedSql } /** * 替换sql中with表达式的options * 包含value变量替换与datasource数据源整体替换 * * @param originalSql * 原始含有敏感信息的SQL语句 * @return * 替换敏感信息后的SQL语句 */ def sqlWithReplace(originalSql: String): String = { val replacedSql = this.replaceSqlAlias(this.sqlWithConfReplace(originalSql)) logger.debug("Flink Sql with options替换成功,最终SQL:" + replacedSql) replacedSql } /** * 替换Flink Sql with表达式中的options选项,规则如下: * * 获取所有flink.sql.with.为前缀的配置信息如: * flink.sql.with.bill_db.connector = mysql * flink.sql.with.bill_db.url = jdbc:mysql://localhost:3306/fire * 上述配置标识定义名为bill_db的数据源,配置了两个options选项分别为: * connector = mysql * url = jdbc:mysql://localhost:3306/fire * sql中即可通过 
'datasource'='bill_db' 引用到上述两项option */ def replaceSqlAlias(sql: String): String = { if (!FireFlinkConf.sqlWithReplaceModeEnable) return sql var replacedSql = sql val matchDatasource = RegularUtils.withDatasourceReg.findFirstIn(sql) if (matchDatasource.isDefined) { val matchValue = RegularUtils.withValueReg.findFirstIn(matchDatasource.get) if (matchValue.isDefined) { // 获取 'datasource'='value' 中的value值 val datasource = matchValue.get.replaceAll("=", "").replace("'", "").trim if (noEmpty(datasource)) { val optionsText = new JStringBuilder FireFlinkConf.flinkSqlWithOptions.foreach(options => { if (options._1.startsWith(s"${datasource}.")) { // 将配置文件中定义的数据源options拼接成flink sql with字句中的options:'key' = 'value' optionsText.append(s"""\t'${options._1.replace(s"${datasource}.", "")}'='${options._2}',\n""") } }) val optionsList = optionsText.toString if (noEmpty(optionsList)) { // 移除动态拼接的option列表中最后一行的逗号 val replaceLast = optionsList.substring(0, optionsList.lastIndexOf(",")) replacedSql = RegularUtils.withDatasourceReg.replaceFirstIn(sql, replaceLast) } } } } replacedSql } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/util/HivePartitionTimeExtractor.scala ================================================ package com.zto.fire.flink.util import org.apache.flink.table.data.TimestampData import org.apache.flink.table.filesystem.PartitionTimeExtractor import org.slf4j.LoggerFactory import java.time.format.DateTimeFormatter import java.time.{LocalDate, LocalDateTime, LocalTime} import java.util /** * hive分区时间提取器,分区格式为yyyyMMdd * * @author ChengLong 2021年7月30日13:56:16 */ private[fire] class HivePartitionTimeExtractor(pattern: String = "$ds") extends PartitionTimeExtractor { private val DEFAULT_PARTITION_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMdd") private val HOUR_PARTITION_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMdd HH") def this() { this("$ds") } override def extract(partitionKeys: util.List[String], partitionValues: util.List[String]): LocalDateTime = { var timestampString: String = null if (pattern == null) timestampString = partitionValues.get(0) else { timestampString = pattern for (i <- 0 until partitionKeys.size) { timestampString = timestampString.replaceAll("\\$" + partitionKeys.get(i), partitionValues.get(i)) } } toLocalDateTime(timestampString).plusHours(-8) } def toLocalDateTime(timestampString: String): LocalDateTime = { try { LocalDateTime.of( LocalDate.parse(timestampString, DEFAULT_PARTITION_FORMATTER), LocalTime.MIDNIGHT) } catch { case e: Exception => { LocalDateTime.of( LocalDate.parse(timestampString, HOUR_PARTITION_FORMATTER), LocalTime.MIDNIGHT) throw e } } } def toMills(dateTime: LocalDateTime): Long = TimestampData.fromLocalDateTime(dateTime).getMillisecond def toMills(timestampString: String): Long = toMills(toLocalDateTime(timestampString)) } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/util/RocketMQUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.util import com.zto.fire._ import com.zto.fire.common.conf.FireRocketMQConf import com.zto.fire.common.util.{LogUtils, Logging} import org.apache.commons.lang3.StringUtils import org.apache.rocketmq.flink.RocketMQConfig /** * RocketMQ相关工具类 * * @author ChengLong * @since 2.0.0 * @create 2021-5-6 14:04:53 */ object RocketMQUtils extends Logging { /** * rocketMQ配置信息 * * @param groupId * 消费组 * @return * rocketMQ相关配置 */ def rocketParams(rocketParam: JMap[String, String] = null, topics: String = null, groupId: String = null, rocketNameServer: String = null, tag: String = null, keyNum: Int = 1): JMap[String, String] = { val optionParams = if (rocketParam != null) rocketParam else new JHashMap[String, String]() if (StringUtils.isNotBlank(topics)) optionParams.put(RocketMQConfig.CONSUMER_TOPIC, topics) if (StringUtils.isNotBlank(groupId)) optionParams.put(RocketMQConfig.CONSUMER_GROUP, groupId) // rocket name server 配置 val confNameServer = FireRocketMQConf.rocketNameServer(keyNum) val finalNameServer = if (StringUtils.isNotBlank(confNameServer)) confNameServer else rocketNameServer if (StringUtils.isNotBlank(finalNameServer)) optionParams.put(RocketMQConfig.NAME_SERVER_ADDR, finalNameServer) // tag配置 val confTag = FireRocketMQConf.rocketConsumerTag(keyNum) val finalTag = if (StringUtils.isNotBlank(confTag)) confTag else tag if (StringUtils.isNotBlank(finalTag)) optionParams.put(RocketMQConfig.CONSUMER_TAG, finalTag) // 起始消费位点 val confOffset = FireRocketMQConf.rocketStartingOffset(keyNum) if (StringUtils.isNotBlank(confOffset)) optionParams.put(RocketMQConfig.CONSUMER_OFFSET_RESET_TO, confOffset) // 以rocket.conf.开头的配置优先级最高 val confMap = FireRocketMQConf.rocketConfMap(keyNum) if (confMap.nonEmpty) optionParams.putAll(confMap) // 日志记录RocketMQ的配置信息 LogUtils.logMap(this.logger, optionParams.toMap, s"RocketMQ configuration. 
keyNum=$keyNum.") optionParams } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire/flink/util/StateCleanerUtils.scala ================================================ package com.zto.fire.flink.util import java.io.{BufferedInputStream, DataInputStream, File, FileInputStream} import java.net.URI import java.util.Date import java.util.regex.{Matcher, Pattern} import com.zto.fire._ import com.zto.fire.common.anno.Internal import com.zto.fire.common.util.UnitFormatUtils.DateUnitEnum import com.zto.fire.common.util.{DateFormatUtils, Logging, UnitFormatUtils} import com.zto.fire.flink.conf.FireFlinkConf import org.apache.commons.lang3.time.DateUtils import org.apache.flink.runtime.checkpoint.{Checkpoints, OperatorSubtaskState} import org.apache.flink.runtime.state.IncrementalRemoteKeyedStateHandle import org.apache.flink.runtime.state.filesystem.FileStateHandle import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.log4j.{Level, Logger} import scala.collection.mutable.ListBuffer /** * flink历史失效状态清理工具 * 清理策略: * conservativeModel:筛选出不再使用的checkpoint文件,将这些文件归档至指定的目录中,并定期删除指定时间的数据 * 直接删除模式:直接删除不再需要的checkpoint文件 * * @author ChengLong 2021-9-6 15:06:21 */ protected[fire] class StateCleanerUtils extends Logging { Logger.getLogger(this.getClass).setLevel(Level.toLevel("info")) // ------------------------------ hdfs 选项 ----------------------------------- // protected val hdfs = FireFlinkConf.stateHdfsUrl protected val hdfsUser = "hadoop" // ------------------------------ checkpoint 选项 ------------------------------ // protected val checkpointDir = "/user/flink/checkpoint" protected val localCheckpointBaseDir = "./home/checkpoint" protected val archiveDir = "/user/flink/archive" // 用于存放当前线上flink任务需要使用到的状态绝对路径 protected val inuserSet = new JHashSet[String]() // download到本地的metadata文件是否采用覆盖的方式避免本地磁盘存放过多的文件 protected val overwrite = true // 是否将失效的状态文件移到到回收站,等待后续清理 protected val conservativeModel = true // 用于存放遍历的checkpoint文件,避免二次遍历导致漏分析的文件被标记为删除 protected val files = ListBuffer[LocatedFileStatus]() // checkpoint元数据的过期时间,AccessTime超过该时间的将会被清理 protected val checkpointTTL = 62 // 计算出checkpointTtl对应的unix时间戳 protected val checkpointTTLStamp = DateUtils.addDays(new Date, -this.checkpointTTL).getTime // 是否删除空文件夹 protected val deleteEmptyDirEnabled = true // ture表示使用访问时间,false表示使用修改时间 protected val useAccessTime = false // ------------------------------ checkpoint归档选项 ---------------------------- // // 默认清理多少天之前的归档checkpoint文件 protected val archiveTTL = 7 protected val archiveTTLStamp = DateUtils.addDays(new Date, -this.archiveTTL).getTime // 用于指定是否删除过期的checkpoint归档文件 protected val deleteArchiveEnabled = true // ------------------------------ savepoint 选项 ------------------------------- // protected val savepointDir = "/user/flink/savepoint" // savepoint的ttl时间 protected val savepointTTL = 10 protected val savepointTTLStamp = DateUtils.addDays(new Date, -this.savepointTTL).getTime // 用于指定是否清理过期savepoint protected val deleteSavepointEnabled = true // ------------------------------ savepoint 选项 ------------------------------- // protected val completedDir = "/user/flink/completed-jobs" protected val completedTTL = 31 protected val completedTTLStamp = DateUtils.addDays(new Date, -this.completedTTL).getTime protected val deleteCompleteJobEnable = true // 指定checkpoint与savepoint的路径 protected val checkpoint_pattern = Pattern.compile("/user/flink/checkpoint/") //指定任务id protected val savepoint_pattern = 
Pattern.compile("/user/flink/savepoint/") //指定任务id /** * 获取HDFS的FileSystem对象 */ @Internal protected def getFileSystem: FileSystem = { val fs = FileSystem.get(new URI(this.hdfs), new Configuration(), this.hdfsUser) fs.setWorkingDirectory(new Path("/")) fs } /** * 解析 operatorSubtaskState 的 ManagedKeyedState * * @param operatorSubtaskState operatorSubtaskState */ @Internal protected def parseManagedKeyedState(operatorSubtaskState: OperatorSubtaskState): Unit = { if (noEmpty(operatorSubtaskState)) { // 本案例针对 Flink RocksDB 的增量 Checkpoint 引发的问题, // 因此仅处理 IncrementalRemoteKeyedStateHandle operatorSubtaskState.getManagedKeyedState.filter(_.isInstanceOf[IncrementalRemoteKeyedStateHandle]) .map(_.asInstanceOf[IncrementalRemoteKeyedStateHandle]).foreach(keyedStateHandle => { // 获取 RocksDB 的 sharedState val sharedState = keyedStateHandle.getSharedState if (noEmpty(sharedState)) { sharedState.map(t => t._2).filter(_.isInstanceOf[FileStateHandle]).map(_.asInstanceOf[FileStateHandle]) .foreach(t => { val filePath = t.getFilePath this.logger.info("parseManagedKeyedState:" + filePath) this.inuserSet.add(filePath.getPath) }) } }) } } /** * 解析 operatorSubtaskState 的 ManagedOperatorState * * @param operatorSubtaskState operatorSubtaskState */ @Internal protected def parseManagedOperatorState(operatorSubtaskState: OperatorSubtaskState): Unit = { if (isEmpty(operatorSubtaskState)) { operatorSubtaskState.getManagedOperatorState.map(_.getDelegateStateHandle).filter(_.isInstanceOf[FileStateHandle]).map(_.asInstanceOf[FileStateHandle]).foreach(fileStateHandle => { val filePath = fileStateHandle.getFilePath this.logger.info("parseManagedKeyedState:" + filePath) this.inuserSet.add(filePath.getPath) }) } } /** * 递归遍历checkpoint目录下所有的_metadata文件 */ @Internal protected def recursionCheckpointDir(): Unit = { var count = 0 var fs: FileSystem = null tryFinally { fs = this.getFileSystem val path = new Path(this.checkpointDir) //增加checkpoint路径的正则匹配 val it = fs.listFiles(path, true) while (it.hasNext) { val status = it.next() val matcher: Matcher = checkpoint_pattern.matcher(status.getPath().toUri.getPath + "/") if (matcher.find) { this.files += status this.logger.info(status.getPath().toUri.getPath) val timeFlag = if (this.useAccessTime) status.getAccessTime else status.getModificationTime // 只分析最近访问时间在配置的metadataTtl之后的metadata文件,也就是说默认62天之前仍未被访问或修改的metadata文件将会被删除 if (status.getPath.getName.endsWith("_metadata") && (timeFlag > this.checkpointTTLStamp)) { // 获取metadata在hdfs上的相对路径 val metadataPath = status.getPath.toString.replace(this.hdfs, "") this.inuserSet.add(metadataPath) this.logger.info(s"开始分析metadata文件:${metadataPath}") // 是否复用同一个本地元数据的路径,如果复用,则分析完成后就会被下一个元数据文件覆盖,否则会保留所有的metadata文件 val localPath = if (this.overwrite) this.localCheckpointBaseDir + "/_metadata" else this.localCheckpointBaseDir + metadataPath // 将metadata文件拷贝到本地进行分析 fs.copyToLocalFile(status.getPath, new Path(localPath)) this.analyzeMetadata(localPath, status.getPath.getParent.toString) count += 1 } } } this.logger.info(s"此次分析metadata文件数共计:${count}") this.logger.info(s"此次inuserSet文件数共计:${inuserSet.size()}") }(if (fs != null) fs.close())(this.logger, catchLog = "分析metadata文件发生异常", finallyCatchLog = "FileSystem.close()失败") } /** * 清理不再被使用的状态数据 */ protected def cleanCheckpoint(): Unit = { var count = 0 var blockSize = 0L var fs: FileSystem = null tryFinally { fs = this.getFileSystem val newFilePath = new Path(s"${this.archiveDir}/${DateFormatUtils.formatCurrentDate()}") fs.mkdirs(newFilePath) this.files.foreach(status => { val currentFile = 
status.getPath.toString.replace(this.hdfs, "") if (!this.inuserSet.contains(currentFile)) { if (this.conservativeModel) { // 保守模式下仅将过期的状态文件移动至指定的文件夹中,等待后续的单独处理 val subPath = status.getPath.getParent.toString.replace(this.hdfs, "").replace(this.checkpointDir + "/", "") val destPath = new Path(s"${this.archiveDir}/${DateFormatUtils.formatCurrentDate()}/$subPath") fs.mkdirs(destPath) fs.rename(status.getPath, destPath) this.logger.info(s"移动状态文件:${status.getPath.toString} to ${destPath.toString}") } else { // 非保守模式下,直接删除失效的状态文件 fs.delete(status.getPath, true) this.logger.info(s"删除状态文件:${status.getPath}") } count += 1 blockSize += status.getBlockSize } }) this.logger.info(s"清理过期文件数:${count},释放磁盘空间:${UnitFormatUtils.readable(blockSize, DateUnitEnum.BYTE)}") }(if (fs != null) fs.close())(this.logger, catchLog = "删除/归档checkpoint文件过程中发生异常", finallyCatchLog = "FileSystem.close()失败") } /** * 通过解析指定的_metadata分析还在被使用的checkpoint文件 * * externalPointer 设置为 当前解析_metadata 的父目录即可 * 解决 状态反序列化中 type为 RELATIVE_STREAM_STATE_HANDLE 导致报错 * Cannot deserialize a RelativeFileStateHandle without a context to make it relative to * * @param path * metadata的绝对路径 */ @Internal protected def analyzeMetadata(path: String, externalPointer: String): Unit = { // 读取元数据文件 val metadataFile = new File(path) var fis: FileInputStream = null var bis: BufferedInputStream = null var dis: DataInputStream = null tryFinally { // 通过IO流获取本地的metadata文件 fis = new FileInputStream(metadataFile) bis = new BufferedInputStream(fis) dis = new DataInputStream(bis) val checkpointMetadata = Checkpoints.loadCheckpointMetadata(dis, this.getClass.getClassLoader, externalPointer) // 遍历 OperatorState,这里的每个 OperatorState 对应一个 Flink 任务的 Operator 算子 // 不要与 OperatorState 和 KeyedState 混淆,不是一个层级的概念 checkpointMetadata.getOperatorStates.filter(_.getStateSize > 0).foreach(operatorState => { this.logger.debug(s"算子状态:${operatorState}") // 遍历当前算子的所有 subtask operatorState.getStates.foreach(operatorSubtaskState => { // 解析 operatorSubtaskState 的 ManagedKeyedState this.parseManagedKeyedState(operatorSubtaskState) // 解析 operatorSubtaskState 的 ManagedOperatorState this.parseManagedOperatorState(operatorSubtaskState) }) }) }(if (dis != null) dis.close())(this.logger, catchLog = "解析metadata文件过程中出现异常", finallyCatchLog = "关闭IO流过程中出现异常") } /** * 删除过期的归档文件 */ protected def deleteArchive(): Unit = { if (!this.deleteArchiveEnabled || !this.conservativeModel) return var fs: FileSystem = null var count = 0 tryFinally { fs = this.getFileSystem val path = new Path(this.archiveDir) val files = fs.listStatus(path) files.filter(file => file.isDirectory).foreach(file => { val timeFlag = if (this.useAccessTime) file.getAccessTime else file.getModificationTime // 清理checkpoint归档目录 if (timeFlag < this.archiveTTLStamp) { fs.delete(file.getPath, true) count += 1 this.logger.info(s"清理checkpoint归档目录成功:${file.getPath},归档时间:${DateFormatUtils.formatDateTime(new Date(timeFlag))}") } }) this.logger.info(s"本次清理checkpoint归档目录共计:${count}个") }(if (fs != null) fs.close())(this.logger, catchLog = "清理checkpoint归档目录出现异常", finallyCatchLog = "FileSystem.close()失败") } /** * 删除空文件夹,文件夹总大小为0的目录会被清空 */ protected def deleteEmptyDir(): Unit = { if (!this.deleteEmptyDirEnabled) return var fs: FileSystem = null var count = 0 tryFinally { fs = this.getFileSystem val path = new Path(this.checkpointDir) //获取指定目录 val files = fs.listStatus(path, new PathFilter { override def accept(p: Path): Boolean = { val matcher: Matcher = checkpoint_pattern.matcher(p.toUri.getPath + "/") matcher.find() } }) files.filter(file => 
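      // only job-level sub-directories under the checkpoint root are inspected; plain files are skipped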
file.isDirectory).foreach(file => { val checkpointList: Array[FileStatus] = fs.listStatus(file.getPath) checkpointList.foreach(file => { val size = fs.getContentSummary(file.getPath).getLength if (size == 0) { fs.delete(file.getPath, true) count += 1 this.logger.info(s"清理空文件夹:${file.getPath},空文件时间:${DateFormatUtils.formatDateTime(new Date(file.getAccessTime))}") } }) }) this.logger.info(s"本次清理空文件夹共计:${count}个") }(if (fs != null) fs.close())(this.logger, catchLog = "清理空文件过程中出现异常", finallyCatchLog = "FileSystem.close()失败") } /** * 定期清理过期的savepoint文件 */ protected def deleteSavepoint(): Unit = { if (!this.deleteSavepointEnabled) return var fs: FileSystem = null var count = 0 tryFinally { fs = this.getFileSystem val path = new Path(this.savepointDir) //获取满足指定目录的 flink savepoint目录 val files = fs.listStatus(path, new PathFilter { override def accept(p: Path): Boolean = { val matcher: Matcher = savepoint_pattern.matcher(p.toUri.getPath + "/") matcher.find() } }) files.filter(file => file.isDirectory).foreach(file => { val savepointList: Array[FileStatus] = fs.listStatus(file.getPath) savepointList.foreach(file => { val timeFlag = if (this.useAccessTime) file.getAccessTime else file.getModificationTime if (timeFlag < this.savepointTTLStamp) { //TODO 考虑是否可以改为删除至回收站 // val t = new Trash(fs.getConf) // t.moveToTrash(file.getPath) fs.delete(file.getPath, true) count += 1 this.logger.info(s"清理savepoint目录成功:${file.getPath},savepoint时间:${DateFormatUtils.formatDateTime(new Date(timeFlag))}") } }) }) this.logger.info(s"本次清理savepoint共计:${count}个") }(if (fs != null) fs.close())(this.logger, catchLog = "清理savepoint文件过程中出现异常", finallyCatchLog = "FileSystem.close()失败") } /** * 定期清理过期的complete job文件 */ protected def deleteCompleteJobs(): Unit = { if (!this.deleteCompleteJobEnable) return var fs: FileSystem = null var count = 0 tryFinally { fs = this.getFileSystem val path = new Path(this.completedDir) val files = fs.listStatus(path) files.foreach(file => { val timeFlag = if (this.useAccessTime) file.getAccessTime else file.getModificationTime if (timeFlag < this.completedTTLStamp) { fs.delete(file.getPath, true) count += 1 this.logger.info(s"清理completed job目录成功:${file.getPath},completed job时间:${DateFormatUtils.formatDateTime(new Date(timeFlag))}") } }) this.logger.info(s"本次清理completed job共计:${count}个") }(if (fs != null) fs.close())(this.logger, catchLog = "清理清理completed job文件过程中出现异常", finallyCatchLog = "FileSystem.close()失败") } /** * 执行清理任务 */ protected def run(): Unit = { elapsed[Unit]("step 5. 清理完毕,执行结束", this.logger) { this.logger.info("开始执行新checkpoint与savepoint清理程序...") this.logger.warn(s"step 1. 开始解析${checkpointTTL}天内增量checkpoint metadata文件并分析直接的血缘关系.") this.recursionCheckpointDir() this.logger.warn("step 2. 开始归档历史的checkpoint文件.") this.cleanCheckpoint() this.logger.warn(s"step 3. 开始清理${archiveTTL}天前过期的checkpoint归档文件.") this.deleteArchive() this.logger.warn("step 3. 开始清理checkpoint空文件夹.") this.deleteEmptyDir() this.logger.warn(s"step 4. 开始清理${savepointTTL}天前过期的savepoint文件.") this.deleteSavepoint() this.logger.warn(s"step 5. 开始清理${completedTTL}天前过期的completed job文件.") this.deleteCompleteJobs() } } } ================================================ FILE: fire-engines/fire-flink/src/main/scala/com/zto/fire.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto import com.zto.fire.core.ext.BaseFireExt import com.zto.fire.flink.ext.batch.{BatchExecutionEnvExt, BatchTableEnvExt, DataSetExt} import com.zto.fire.flink.ext.function.{RichFunctionExt, RuntimeContextExt} import com.zto.fire.flink.ext.stream._ import org.apache.flink.api.common.functions.{RichFunction, RuntimeContext} import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment} import org.apache.flink.streaming.api.scala.{DataStream, KeyedStream, StreamExecutionEnvironment} import org.apache.flink.table.api.{Table, TableEnvironment, TableResult} import org.apache.flink.types.Row /** * 预定义fire框架中的扩展工具 * * @author ChengLong * @since 1.0.0 * @create 2020-12-22 13:51 */ package object fire extends BaseFireExt { /** * StreamExecutionEnvironment扩展 * * @param env * StreamExecutionEnvironment对象 */ implicit class StreamExecutionEnvExtBridge(env: StreamExecutionEnvironment) extends StreamExecutionEnvExt(env) { } /** * StreamTableEnvironment扩展 * * @param tableEnv * StreamTableEnvironment对象 */ implicit class TableEnvExtBridge(tableEnv: TableEnvironment) extends TableEnvExt(tableEnv) { } /** * DataStream扩展 * * @param dataStream * DataStream对象 */ implicit class DataStreamExtBridge[T](dataStream: DataStream[T]) extends DataStreamExt(dataStream) { } /** * KeyedStream扩展 * * @param keyedStream * KeyedStream对象 */ implicit class KeyedStreamExtBridge[T, K](keyedStream: KeyedStream[T, K]) extends KeyedStreamExt[T, K](keyedStream) { } /** * TableResult扩展 */ implicit class TableResultImplBridge(tableResult: TableResult) extends TableResultImplExt(tableResult) { } /** * Table扩展 * * @param table * Table对象 */ implicit class StreamTableExtBridge(table: Table) extends TableExt(table) { } /** * BatchTableEnvironment扩展 * * @param tableEnv * BatchTableEnvironment对象 */ implicit class BatchTableEnvExtBridge(tableEnv: TableEnvironment) extends BatchTableEnvExt(tableEnv) { } /** * ExecutionEnvironment扩展 * * @param env * ExecutionEnvironment对象 */ implicit class BatchExecutionEnvExtBridge(env: ExecutionEnvironment) extends BatchExecutionEnvExt(env) { } /** * DataSet扩展 * * @param dataSet * DataSet对象 */ implicit class DataSetExtBridge[T](dataSet: DataSet[T]) extends DataSetExt(dataSet) { } /** * Row扩展 */ implicit class RowExtBridge(row: Row) extends RowExt(row) { } /** * Flink SQL扩展 */ implicit class SQLExtBridge(sql: String) extends SQLExt(sql) { } /** * Flink RuntimeContext扩展 */ implicit class RuntimeContextExtBridge(runtimeContext: RuntimeContext) extends RuntimeContextExt(runtimeContext) { } /** * Flink RichFunction扩展 */ implicit class RichFunctionExtBridge(richFunction: RichFunction) extends RichFunctionExt(richFunction) { } } ================================================ FILE: fire-engines/fire-flink/src/main/scala-flink-1.12/com/zto/fire/flink/sql/FlinkSqlParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.sql import com.zto.fire.common.anno.Internal /** * Flink SQL解析器,用于解析Flink SQL语句中的库、表、分区、操作类型等信息 * * @author ChengLong 2021-6-18 16:41:04 * @since 2.0.0 */ @Internal private[fire] object FlinkSqlParser extends FlinkSqlParserBase { } ================================================ FILE: fire-engines/fire-flink/src/main/scala-flink-1.13/com/zto/fire/flink/sql/FlinkSqlParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.flink.sql import com.zto.fire.common.anno.Internal /** * Flink SQL解析器,用于解析Flink SQL语句中的库、表、分区、操作类型等信息 * * @author ChengLong 2021-6-18 16:41:04 * @since 2.0.0 */ @Internal private[fire] object FlinkSqlParser extends FlinkSqlParserBase { } ================================================ FILE: fire-engines/fire-flink/src/main/scala-flink-1.14/com/zto/fire/flink/sql/FlinkSqlParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
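The com.zto.fire package object above is built entirely on Scala's implicit-class "bridge" pattern: each implicit class ...Bridge wraps an engine type so that, once com.zto.fire._ is imported, that type silently picks up the methods of the corresponding Ext class. A minimal, self-contained sketch of the pattern under illustrative names (none of these belong to the fire API):

    package com

    // plays the role of an Ext class: it holds the added behaviour
    class GreetingExt(s: String) {
      def greet: String = s"hello, $s"
    }

    // plays the role of the package object: the implicit bridge activates the extension
    package object example {
      implicit class GreetingExtBridge(s: String) extends GreetingExt(s)
    }

    // usage elsewhere: import com.example._  then  "fire".greet  returns "hello, fire"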
*/ package com.zto.fire.flink.sql import com.zto.fire.common.anno.Internal /** * Flink SQL解析器,用于解析Flink SQL语句中的库、表、分区、操作类型等信息 * * @author ChengLong 2021-6-18 16:41:04 * @since 2.0.0 */ @Internal private[fire] object FlinkSqlParser extends FlinkSqlParserBase { } ================================================ FILE: fire-engines/fire-spark/pom.xml ================================================ 4.0.0 fire-spark_${spark.reference} jar Fire : Engines : Spark com.zto.fire fire-engines 2.3.2-SNAPSHOT ../pom.xml com.zto.fire fire-enhance-spark_${spark.reference} ${fire.version} ${maven.scope} com.zto.fire fire-connector-spark-rocketmq_${spark.reference} ${fire.version} ${maven.scope} com.zto.fire fire-connector-spark-hbase_${spark.reference} ${fire.version} ${maven.scope} org.apache.spark spark-core_${scala.binary.version} com.esotericsoftware.kryo kryo ${spark.version} ${maven.scope} org.apache.spark spark-sql_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.spark spark-streaming_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.spark spark-sql-kafka-0-10_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.spark spark-streaming_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.spark spark-streaming-kafka-0-10_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.hadoop hadoop-common ${hadoop.version} ${maven.scope} org.apache.hadoop hadoop-hdfs ${hadoop.version} ${maven.scope} org.apache.hadoop hadoop-client ${hadoop.version} ${maven.scope} org.apache.hbase hbase-common ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-server ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-client ${hbase.version} org.apache.rocketmq rocketmq-client ${rocketmq.version} ${maven.scope} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-engines/fire-spark/src/main/java/com/zto/fire/spark/anno/SparkConf.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.anno; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解进行任务的配置,支持纯注解方式进行Spark相关参数配置 * * @author ChengLong 2022-08-18 08:57:23 * @since 2.3.2 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface SparkConf { /** * 配置项列表,key=value的字符串形式 */ String[] props() default ""; /** * 配置的字符串 */ String value() default ""; } ================================================ FILE: fire-engines/fire-spark/src/main/java/com/zto/fire/spark/anno/Streaming.java ================================================ package com.zto.fire.spark.anno; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * 基于注解的方式配置Spark Streaming任务 * * @author ChengLong * @date 2022-04-30 21:44:19 * @since 2.2.1 */ @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface Streaming { /** * 批次时间(s) */ int value() default 10; /** * 批次时间(s),同value字段 */ int interval() default -1; /** * 是否开启spark streaming的checkpoint */ boolean checkpoint() default false; /** * 是否自动提交job:call startAwaitTermination() */ boolean autoStart() default true; /** * 并行执行的streaming批次数 */ int concurrent() default -1; /** * 指定消费kafka或rocketmq每秒从每个分区获取的最大记录数 */ long maxRatePerPartition() default -1; /** * 是否启用反压机制 */ boolean backpressure() default true; /** * 启用反压机制时每个接收器接收第一批数据的初始最大速率 */ long backpressureInitialRate() default -1; /** * 是否优雅的停止streaming */ boolean stopGracefullyOnShutdown() default true; } ================================================ FILE: fire-engines/fire-spark/src/main/java/com/zto/fire/spark/anno/StreamingDuration.java ================================================ package com.zto.fire.spark.anno; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; /** * Spark Streaming任务的批次时间 * * @author ChengLong 2021年8月3日19:39:28 * @since 2.1.1 */ @Deprecated @Target(ElementType.TYPE) @Retention(RetentionPolicy.RUNTIME) public @interface StreamingDuration { /** * 批次时间(s) */ int value() default 10; /** * 批次时间(s),同value字段 */ int interval() default -1; /** * 是否开启spark streaming的checkpoint */ boolean checkpoint() default false; } ================================================ FILE: fire-engines/fire-spark/src/main/java/com/zto/fire/spark/bean/ColumnMeta.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
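Both annotations above are class-level and retained at runtime, so a job only has to be decorated with them. A hedged usage sketch (SparkStreaming is the trait defined further down in this dump); the object name, configuration keys and values are illustrative, and only the annotation members shown above are real:

    // @SparkConf injects engine settings, @Streaming configures the streaming batch behaviour
    @SparkConf(props = Array("spark.executor.instances=10", "spark.executor.memory=4g"))
    @Streaming(interval = 30, checkpoint = false, maxRatePerPartition = 1000, backpressure = true)
    object DemoStreamingJob extends SparkStreaming {
      override def process: Unit = {
        // consume Kafka/RocketMQ and write results here
      }
    }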
*/ package com.zto.fire.spark.bean; /** * 用于封装字段元数据 * * @author ChengLong 2019-9-2 13:19:06 */ public class ColumnMeta { // 所在数据库名称 protected String database; // 表名 protected String tableName; // 字段描述 protected String description; // 字段名 protected String columnName; // 字段类型 protected String dataType; // 是否允许为空 protected Boolean nullable; // 是否为分区字段 protected Boolean isPartition; // 是否为bucket字段 protected Boolean isBucket; public ColumnMeta() { } private ColumnMeta(Builder builder) { this.nullable = builder.nullable; this.tableName = builder.tableName; this.columnName = builder.columnName; this.database = builder.database; this.dataType = builder.dataType; this.description = builder.description; this.isBucket = builder.isBucket; this.isPartition = builder.isPartition; } public String getDatabase() { return database; } public String getTableName() { return tableName; } public String getDescription() { return description; } public String getColumnName() { return columnName; } public String getDataType() { return dataType; } public Boolean getNullable() { return nullable; } public Boolean getPartition() { return isPartition; } public Boolean getBucket() { return isBucket; } public static class Builder extends ColumnMeta { public Builder setDescription(String description) { this.description = description; return this; } public Builder setColumnName(String columnName) { this.columnName = columnName; return this; } public Builder setDataType(String dataType) { this.dataType = dataType; return this; } public Builder setNullable(Boolean nullable) { this.nullable = nullable; return this; } public Builder setPartition(Boolean partition) { isPartition = partition; return this; } public Builder setBucket(Boolean bucket) { isBucket = bucket; return this; } public Builder setDatabase(String database) { this.database = database; return this; } public Builder setTableName(String tableName) { this.tableName = tableName; return this; } public ColumnMeta build() { return new ColumnMeta(this); } } } ================================================ FILE: fire-engines/fire-spark/src/main/java/com/zto/fire/spark/bean/FunctionMeta.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
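ColumnMeta above is assembled through its nested Builder rather than through setters; a short sketch with illustrative field values:

    val meta: ColumnMeta = new ColumnMeta.Builder()
      .setDatabase("dim")
      .setTableName("user_profile")
      .setColumnName("user_id")
      .setDataType("bigint")
      .setNullable(false)
      .setPartition(false)
      .setBucket(false)
      .build()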
*/
package com.zto.fire.spark.bean;

/**
 * 用于封装函数元数据信息
 * @author ChengLong 2019-9-2 16:50:50
 */
public class FunctionMeta {
    // 函数描述
    private String description;
    // 数据库
    private String database;
    // 函数名称
    private String name;
    // 函数定义的类
    private String className;
    // 是否为临时函数
    private Boolean isTemporary;

    public FunctionMeta() {
    }

    public FunctionMeta(String description, String database, String name, String className, Boolean isTemporary) {
        this.description = description;
        this.database = database;
        this.name = name;
        this.className = className;
        this.isTemporary = isTemporary;
    }

    public String getDescription() { return description; }

    public void setDescription(String description) { this.description = description; }

    public String getDatabase() { return database; }

    public void setDatabase(String database) { this.database = database; }

    public String getName() { return name; }

    public void setName(String name) { this.name = name; }

    public String getClassName() { return className; }

    public void setClassName(String className) { this.className = className; }

    public Boolean getTemporary() { return isTemporary; }

    public void setTemporary(Boolean temporary) { isTemporary = temporary; }
}

================================================
FILE: fire-engines/fire-spark/src/main/java/com/zto/fire/spark/bean/GenerateBean.java
================================================
package com.zto.fire.spark.bean;

import java.util.List;

/**
 * 自动生成数据的JavaBean父类
 *
 * @param <T> 子类具体的类型
 * @author ChengLong 2022-03-07 14:49:51
 * @since 2.2.1
 */
public interface GenerateBean<T extends GenerateBean<T>> {

    /**
     * 自动生成对象实例的具体逻辑
     * @return 对象实例
     */
    List<T> generate();
}

================================================
FILE: fire-engines/fire-spark/src/main/java/com/zto/fire/spark/bean/RestartParams.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package com.zto.fire.spark.bean; import java.util.Map; /** * 重启streaming参数 * {"batchDuration":10,"restartSparkContext":false,"stopGracefully": false,"sparkConf":{"spark.streaming.concurrentJobs":"2"}} * @author ChengLong 2019-5-5 16:57:49 */ public class RestartParams { // 批次时间(秒) private long batchDuration; // 是否重启SparkContext对象 private boolean restartSparkContext; // 是否等待数据全部处理完成再重启 private boolean stopGracefully; // 是否做checkPoint private boolean isCheckPoint; // 附加的conf信息 private Map sparkConf; public long getBatchDuration() { return batchDuration; } public void setBatchDuration(long batchDuration) { this.batchDuration = batchDuration; } public boolean isRestartSparkContext() { return restartSparkContext; } public void setRestartSparkContext(boolean restartSparkContext) { this.restartSparkContext = restartSparkContext; } public Map getSparkConf() { return sparkConf; } public void setSparkConf(Map sparkConf) { this.sparkConf = sparkConf; } public RestartParams() { } public boolean isStopGracefully() { return stopGracefully; } public void setStopGracefully(boolean stopGracefully) { this.stopGracefully = stopGracefully; } public boolean isCheckPoint() { return isCheckPoint; } public void setCheckPoint(boolean checkPoint) { isCheckPoint = checkPoint; } public RestartParams(long batchDuration, boolean restartSparkContext, boolean stopGracefully, boolean isCheckPoint, Map sparkConf) { this.batchDuration = batchDuration; this.restartSparkContext = restartSparkContext; this.stopGracefully = stopGracefully; this.isCheckPoint = isCheckPoint; this.sparkConf = sparkConf; } } ================================================ FILE: fire-engines/fire-spark/src/main/java/com/zto/fire/spark/bean/SparkInfo.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
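The JSON documented in the RestartParams class comment above maps one-to-one onto its fields. A hedged sketch of deserializing it with the framework's JSONUtils, the same helper BaseSparkStreaming.hotRestart uses further down in this dump (assuming the default JavaBean field mapping of the underlying JSON library):

    import com.zto.fire.common.util.JSONUtils
    import com.zto.fire.spark.bean.RestartParams

    val json =
      """{"batchDuration":10,"restartSparkContext":false,"stopGracefully":false,
        |"sparkConf":{"spark.streaming.concurrentJobs":"2"}}""".stripMargin
    val params = JSONUtils.parseObject[RestartParams](json)
    // expected: params.getBatchDuration == 10 and params.isRestartSparkContext == false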
*/ package com.zto.fire.spark.bean; import com.zto.fire.common.util.DateFormatUtils; import java.util.Map; /** * 用于封装spark运行时的信息 * @author ChengLong 2019-5-13 10:27:33 */ public class SparkInfo { /** * spark应用名称 */ private String appName; /** * spark应用的类名 */ private String className; /** * common包的版本号 */ private String fireVersion; /** * spark conf信息 */ private Map conf; /** * 当前spark版本 */ private String version; /** * spark 运行模式 */ private String master; /** * spark 的 applicationId */ private String applicationId; /** * yarn 的 applicationAttemptId */ private String applicationAttemptId; /** * spark 的 webui地址 */ private String ui; /** * driver的进程id */ private String pid; /** * spark的运行时间 */ private String uptime; /** * 程序启动的起始时间 */ private String launchTime; /** * 申请的每个executor的内存大小 */ private String executorMemory; /** * 申请的executor个数 */ private String executorInstances; /** * 申请的每个executor的cpu数 */ private String executorCores; /** * 申请的driver cpu数量 */ private String driverCores; /** * 申请的driver内存大小 */ private String driverMemory; /** * 申请的driver堆外内存大小 */ private String driverMemoryOverhead; /** * driver所在服务器ip */ private String driverHost; /** * driver占用的端口号 */ private String driverPort; /** * restful接口的端口号 */ private String restPort; /** * 申请的executor堆外内存大小 */ private String executorMemoryOverhead; /** * 当前spark应用申请的总内存大小(driver+executor+总的堆外内存) */ private String memory; /** * 当前spark应用申请的总的cpu数量(driver+executor) */ private String cpu; /** * streaming批次时间 */ private String batchDuration; /** * 当前driver系统时间 */ private String timestamp = DateFormatUtils.formatCurrentDateTime(); /** * 配置信息 */ private Map properties; public String getAppName() { return appName; } public void setAppName(String appName) { this.appName = appName; } public String getClassName() { return className; } public void setClassName(String className) { this.className = className; } public String getFireVersion() { return fireVersion; } public void setFireVersion(String fireVersion) { this.fireVersion = fireVersion; } public Map getConf() { return conf; } public void setConf(Map conf) { this.conf = conf; } public String getVersion() { return version; } public void setVersion(String version) { this.version = version; } public String getMaster() { return master; } public void setMaster(String master) { this.master = master; } public String getApplicationId() { return applicationId; } public void setApplicationId(String applicationId) { this.applicationId = applicationId; } public String getApplicationAttemptId() { return applicationAttemptId; } public void setApplicationAttemptId(String applicationAttemptId) { this.applicationAttemptId = applicationAttemptId; } public String getUi() { return ui; } public void setUi(String ui) { this.ui = ui; } public String getPid() { return pid; } public void setPid(String pid) { this.pid = pid; } public String getUptime() { return uptime; } public void setUptime(String uptime) { this.uptime = uptime; } public String getLaunchTime() { return launchTime; } public void setLaunchTime(String launchTime) { this.launchTime = launchTime; } public String getExecutorMemory() { return executorMemory; } public void setExecutorMemory(String executorMemory) { this.executorMemory = executorMemory; } public String getExecutorInstances() { return executorInstances; } public void setExecutorInstances(String executorInstances) { this.executorInstances = executorInstances; } public String getExecutorCores() { return executorCores; } public void setExecutorCores(String executorCores) { this.executorCores = 
executorCores; } public String getDriverCores() { return driverCores; } public void setDriverCores(String driverCores) { this.driverCores = driverCores; } public String getDriverMemory() { return driverMemory; } public void setDriverMemory(String driverMemory) { this.driverMemory = driverMemory; } public String getDriverMemoryOverhead() { return driverMemoryOverhead; } public void setDriverMemoryOverhead(String driverMemoryOverhead) { this.driverMemoryOverhead = driverMemoryOverhead; } public String getDriverHost() { return driverHost; } public void setDriverHost(String driverHost) { this.driverHost = driverHost; } public String getDriverPort() { return driverPort; } public void setDriverPort(String driverPort) { this.driverPort = driverPort; } public String getExecutorMemoryOverhead() { return executorMemoryOverhead; } public void setExecutorMemoryOverhead(String executorMemoryOverhead) { this.executorMemoryOverhead = executorMemoryOverhead; } public String getMemory() { return memory; } public void setMemory(String memory) { this.memory = memory; } public String getCpu() { return cpu; } public void setCpu(String cpu) { this.cpu = cpu; } public String getBatchDuration() { return batchDuration; } public void setBatchDuration(String batchDuration) { this.batchDuration = batchDuration; } public String getTimestamp() { return timestamp; } public void setTimestamp(String timestamp) { this.timestamp = timestamp; } public String getRestPort() { return restPort; } public void setRestPort(String restPort) { this.restPort = restPort; } public Map getProperties() { return properties; } public void setProperties(Map properties) { this.properties = properties; } /** * 计算cpu和内存总数 */ public void computeCpuMemory() { this.memory = (Integer.parseInt(this.driverMemory.replace("g", "")) + Integer.parseInt(this.driverMemoryOverhead.replace("g", "")) + Integer.parseInt(this.executorInstances) * (Integer.parseInt(this.executorMemory.replace("g", "")) + Integer.parseInt(this.executorMemoryOverhead.replace("g", "")))) + "g"; this.cpu = (Integer.parseInt(this.executorInstances) * Integer.parseInt(this.executorCores) + Integer.parseInt(this.driverCores)) + ""; } } ================================================ FILE: fire-engines/fire-spark/src/main/java/com/zto/fire/spark/bean/TableMeta.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
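Note that computeCpuMemory above only works when every memory figure is a whole number of gigabytes with a trailing "g", since it strips the suffix and calls Integer.parseInt. A worked example with illustrative resource settings:

    // driverMemory = "2g", driverMemoryOverhead = "1g", driverCores = "2"
    // executorInstances = "10", executorMemory = "4g", executorMemoryOverhead = "1g", executorCores = "4"
    // memory = 2 + 1 + 10 * (4 + 1) = 53  -> "53g"
    // cpu    = 10 * 4 + 2 = 42            -> "42"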
*/ package com.zto.fire.spark.bean; /** * 用于封装表的元数据 * @author ChengLong 2019-9-2 13:11:56 */ public class TableMeta { // 表的描述 private String description; // 所在数据库名称 private String database; // 表名 private String tableName; // 表的类型 private String tableType; // 是否为临时表 private Boolean isTemporary; public String getDescription() { return description; } public void setDescription(String description) { this.description = description; } public String getDatabase() { return database; } public void setDatabase(String database) { this.database = database; } public String getTableName() { return tableName; } public void setTableName(String tableName) { this.tableName = tableName; } public String getTableType() { return tableType; } public void setTableType(String tableType) { this.tableType = tableType; } public Boolean getTemporary() { return isTemporary; } public void setTemporary(Boolean temporary) { isTemporary = temporary; } public TableMeta() { } public TableMeta(String description, String database, String tableName, String tableType, Boolean isTemporary) { this.description = description; this.database = database; this.tableName = tableName; this.tableType = tableType; this.isTemporary = isTemporary; } } ================================================ FILE: fire-engines/fire-spark/src/main/java/com/zto/fire/spark/task/SparkSchedulerManager.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.task; import com.zto.fire.core.task.SchedulerManager; import org.apache.spark.SparkEnv; /** * Spark 定时调度任务管理器 * * @author ChengLong * @create 2020-12-18 17:00 * @since 1.0.0 */ public class SparkSchedulerManager extends SchedulerManager { // 单例对象 private static SchedulerManager instance = null; static { instance = new SparkSchedulerManager(); } private SparkSchedulerManager() {} /** * 获取单例实例 */ public static SchedulerManager getInstance() { return instance; } @Override protected String label() { SparkEnv sparkEnv = SparkEnv.get(); if (sparkEnv == null || DRIVER.equalsIgnoreCase(sparkEnv.executorId())) { return DRIVER; } else { return EXECUTOR; } } } ================================================ FILE: fire-engines/fire-spark/src/main/resources/spark-core.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # \u5728yarn-cluster\u6A21\u5F0F\u4E0B\u6CE8\u518Chook\u540E\u6267\u884CSystem.exit\u4F1A\u5BFC\u81F4yarn\u4E0A\u4EFB\u52A1\u975E\u6B63\u5E38\u72B6\u6001\u9000\u51FA\uFF0C\u56E0\u6B64Spark Core\u4EFB\u52A1\u7F6E\u4E3Afalse fire.shutdown.auto.exit = false spark.fire.config_center.enable = false spark.fire.rest.enable = false spark.ui.killEnabled = false spark.port.maxRetries = 200 spark.default.parallelism = 1000 spark.sql.broadcastTimeout = 3000 spark.ui.timeline.tasks.maximum = 300 spark.sql.parquet.writeLegacyFormat = true spark.scheduler.listenerbus.eventqueue.size = 130000 spark.serializer = org.apache.spark.serializer.KryoSerializer # \u8840\u7F18\u89E3\u6790\u5F02\u6B65\u7EBF\u7A0B\u6267\u884C\u7684\u6B21\u6570 fire.lineage.run.count = 360 # \u5B9A\u65F6\u89E3\u6790\u57CB\u70B9SQL\u7684\u521D\u59CB\u5EF6\u8FDF\uFF08s\uFF09 fire.lineage.run.initialDelay = 10 # \u5B9A\u65F6\u89E3\u6790\u57CB\u70B9SQL\u7684\u6267\u884C\u9891\u7387\uFF08s\uFF09 fire.lineage.run.period = 10 ================================================ FILE: fire-engines/fire-spark/src/main/resources/spark-streaming.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # spark streaming的remember时间,-1表示不生效(ms) spark.streaming.remember = -1 spark.fire.hbase.scan.partitions = 20 # spark streaming批次时间,可覆盖代码中所指定的时间 # spark.streaming.batch.duration = # 用于在消费多个topic时区分实例 # spark.rocket.consumer.instance = driver # 以下是Spark引擎调优参数 spark.port.maxRetries = 200 spark.ui.retainedJobs = 500 spark.ui.killEnabled = false spark.ui.retailedStages = 300 spark.default.parallelism = 300 spark.sql.broadcastTimeout = 3000 spark.streaming.concurrentJobs = 1 spark.ui.timeline.tasks.maximum = 300 # 任务通过提交脚本提交到yarn后主动退出提交脚本进程,降低提交节点资源占用(注:此配置需要放到spark-default或提交任务通过--conf指定才会生效) spark.yarn.submit.waitAppCompletion = false spark.sql.parquet.writeLegacyFormat = true spark.streaming.backpressure.enabled = true spark.streaming.stopGracefullyOnShutdown = true spark.serializer = org.apache.spark.serializer.KryoSerializer ================================================ FILE: fire-engines/fire-spark/src/main/resources/spark.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. 
See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # ----------------------------------------------- < fire \u914D\u7F6E > ------------------------------------------------ # # flink \u5F15\u64CE arthas\u542F\u52A8\u5668\u7C7B\u540D fire.analysis.arthas.launcher = com.zto.fire.spark.plugin.SparkArthasLauncher # \u4E3B\u952E\u914D\u7F6E\u6620\u5C04\u7BA1\u7406\u7C7B fire.conf.anno.manager.class = com.zto.fire.spark.conf.SparkAnnoManager # ----------------------------------------------- < spark \u914D\u7F6E > ----------------------------------------------- # # spark\u7684\u5E94\u7528\u540D\u79F0\uFF0C\u4E3A\u7A7A\u5219\u53D6\u7C7B\u540D spark.appName = # spark local\u6A21\u5F0F\u4E0B\u4F7F\u7528\u591A\u5C11core\u8FD0\u884C\uFF0C\u9ED8\u8BA4\u4E3Alocal[*]\uFF0C\u81EA\u52A8\u6839\u636E\u5F53\u524Dpc\u7684cpu\u6838\u5FC3\u6570\u8BBE\u7F6E spark.local.cores = * # spark checkpoint\u76EE\u5F55\u5730\u5740 spark.chkpoint.dir = hdfs://appcluster/user/spark/ckpoint/ # \u9ED8\u8BA4\u7684spark\u65E5\u5FD7\u7EA7\u522B spark.log.level = WARN spark.redaction.regex = (?i)secret|password|map|address|namenode|connection|metastore spark.fire.scheduler.blacklist = jvmMonitor # \u6307\u5B9A\u5728spark\u5F15\u64CE\u4E0B\uFF0C\u53EF\u8FDB\u884C\u914D\u7F6E\u540C\u6B65\u7684\u5B50\u7C7B\u5B9E\u73B0 spark.fire.conf.deploy.engine = com.zto.fire.spark.sync.SyncSparkEngine # \u662F\u5426\u542F\u7528sql\u6269\u5C55\u7528\u4E8E\u8840\u7F18\u91C7\u96C6 spark.fire.sql.extensions.enable = true # stage \u5931\u8D25\u7684\u6700\u5927\u6B21\u6570\uFF0C\u5C0F\u4E8E\u7B49\u4E8E\u96F6\u8868\u793A\u4E0D\u5F00\u542F spark.fire.stage.maxFailures = -1 # ----------------------------------------------- < kafka \u914D\u7F6E > ----------------------------------------------- # # kafka\u7684groupid\uFF0C\u4E3A\u7A7A\u5219\u53D6\u7C7B\u540D spark.kafka.group.id = # bigdata\u8868\u793A\u8FDE\u63A5\u5927\u6570\u636E\u7684kafka\uFF0Czms\u8868\u793A\u8FDE\u63A5zms\u7684kafka\u96C6\u7FA4 # spark.kafka.brokers.name = bigdata # topic\u5217\u8868 spark.kafka.topics = # \u7528\u4E8E\u914D\u7F6E\u542F\u52A8\u65F6\u7684\u6D88\u8D39\u4F4D\u70B9\uFF0C\u9ED8\u8BA4\u53D6\u6700\u65B0 spark.kafka.starting.offsets = latest # \u6570\u636E\u4E22\u5931\u65F6\u6267\u884C\u5931\u8D25 spark.kafka.failOnDataLoss = true # \u662F\u5426\u542F\u7528\u81EA\u52A8commit spark.kafka.enable.auto.commit = false # \u4EE5spark.kafka.conf\u5F00\u5934\u7684\u914D\u7F6E\u652F\u6301\u6240\u6709kafka client\u7684\u914D\u7F6E #spark.kafka.conf.session.timeout.ms = 300000 #spark.kafka.conf.request.timeout.ms = 400000 # ----------------------------------------------- < hive \u914D\u7F6E > ------------------------------------------------ # # hive \u96C6\u7FA4\u540D\u79F0\uFF08batch\u79BB\u7EBFhive/streaming 180\u96C6\u7FA4hive/test\u672C\u5730\u6D4B\u8BD5hive\uFF09\uFF0C\u7528\u4E8Espark\u8DE8\u96C6\u7FA4\u8BFB\u53D6hive\u5143\u6570\u636E\u4FE1\u606F spark.hive.cluster = # 
\u4EE5spark.hive.conf.\u4E3A\u524D\u7F00\u7684\u914D\u7F6E\u5C06\u76F4\u63A5\u751F\u6548\uFF0C\u6BD4\u5982\u5F00\u542Fhive\u52A8\u6001\u5206\u533A # this.spark.sql("set hive.exec.dynamic.partition=true") #spark.hive.conf.hive.exec.dynamic.partition = true # spark.sqlContext.sql("set hive.exec.dynamic.partition.mode=nonstrict") #spark.hive.conf.hive.exec.dynamic.partition.mode = nonstrict #spark.hive.conf.hive.exec.max.dynamic.partitions = 5000 # ----------------------------------------------- < HBase \u914D\u7F6E > ----------------------------------------------- # # \u7528\u4E8E\u533A\u5206\u4E0D\u540C\u7684hbase\u96C6\u7FA4: batch/streaming/old/test spark.hbase.cluster = # --------------------------------------------- < RocketMQ \u914D\u7F6E > ---------------------------------------------- # spark.rocket.cluster.map.test = rocket01:9876;rocket02:9876 # \u4EE5spark.rocket.conf\u5F00\u5934\u7684\u914D\u7F6E\u652F\u6301\u6240\u6709rocket client\u7684\u914D\u7F6E #spark.rocket.conf.pull.max.speed.per.partition = 5000 # ----------------------------------------------- < impala \u914D\u7F6E > ---------------------------------------------- # spark.impala.connection.url = jdbc:hive2://hive-server:21050/;auth=noSasl spark.impala.jdbc.driver.class.name = org.apache.hive.jdbc.HiveDriver # ----------------------------------------------- < spark \u53C2\u6570 > ----------------------------------------------- # # Spark\u76F8\u5173\u4F18\u5316\u53C2\u6570\u5217\u5728\u4E0B\u9762\u4F1A\u81EA\u52A8\u88ABfire\u52A0\u8F7D\u751F\u6548 ================================================ FILE: fire-engines/fire-spark/src/main/resources/structured-streaming.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # spark.port.maxRetries = 200 spark.ui.killEnabled = false spark.default.parallelism = 1000 spark.sql.broadcastTimeout = 3000 spark.ui.timeline.tasks.maximum = 300 spark.scheduler.listenerbus.eventqueue.size = 130000 spark.serializer = org.apache.spark.serializer.KryoSerializer ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/BaseSpark.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark import com.zto.fire._ import com.zto.fire.common.conf.{FireFrameworkConf, FireHDFSConf, FireHiveConf} import com.zto.fire.common.util.{OSUtils, PropUtils, SQLUtils} import com.zto.fire.core.BaseFire import com.zto.fire.core.rest.RestServerManager import com.zto.fire.spark.acc.AccumulatorManager import com.zto.fire.spark.conf.FireSparkConf import com.zto.fire.spark.listener.FireSparkListener import com.zto.fire.spark.rest.SparkSystemRestful import com.zto.fire.spark.sql.SqlExtensions import com.zto.fire.spark.task.{SparkInternalTask, SparkSchedulerManager} import com.zto.fire.spark.util.{SparkSingletonFactory, SparkUtils} import org.apache.commons.lang3.StringUtils import org.apache.spark.scheduler.SparkListener import org.apache.spark.sql.catalog.Catalog import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession} import org.apache.spark.streaming.{StreamingContext, StreamingContextState} import org.apache.spark.{SparkConf, SparkContext} import scala.util.Try /** * Spark通用父类 * Created by ChengLong on 2018-03-06. */ trait BaseSpark extends SparkListener with BaseFire with Serializable { private[fire] var _conf: SparkConf = _ protected[fire] var _spark: SparkSession = _ protected lazy val spark, fire: SparkSession = _spark protected lazy val sql = this.executeSql _ protected[fire] var sc: SparkContext = _ protected[fire] var catalog: Catalog = _ protected[fire] var ssc: StreamingContext = _ protected[fire] var hiveContext, sqlContext: SQLContext = _ protected[fire] val acc = AccumulatorManager protected[fire] var batchDuration: Long = _ protected[fire] var listener: SparkListener = _ protected[fire] var taskSchedule: SparkInternalTask = _ /** * 生命周期方法:初始化fire框架必要的信息 * 注:该方法会同时在driver端与executor端执行 */ override private[fire] final def boot: Unit = { // 进Driver端进行引擎配置与用户配置的加载,executor端会通过fire进行分发,应避免在executor端加载引擎和用户配置文件 if (SparkUtils.isDriver) { this.loadConf PropUtils.load(FireFrameworkConf.userCommonConf: _*) //.loadJobConf(this.getClass.getName) this.restfulRegister = new RestServerManager().startRestPort() this.systemRestful = new SparkSystemRestful(this) // 注册到实时平台,并覆盖配置信息 PropUtils.loadJobConf(this.getClass.getName) } PropUtils.setProperty(FireFrameworkConf.DRIVER_CLASS_NAME, this.className) if (StringUtils.isNotBlank(FireSparkConf.appName)) { this.appName = FireSparkConf.appName } SparkSingletonFactory.setAppName(this.appName) super.boot this.logger.info("<-- 完成fire框架初始化 -->") } /** * 生命周期方法:用于关闭SparkContext */ override final def stop: Unit = { if (noEmpty(this._spark, this.sc) && !this.sc.isStopped) { this._spark.stop() } } /** * 生命周期方法:进行fire框架的资源回收 * 注:不允许子类覆盖 */ override protected[fire] final def shutdown(stopGracefully: Boolean = true, inListener: Boolean = false): Unit = { try { this.logger.info("<-- 完成用户资源回收 -->") if (!inListener) { // 事件监听器中无法进行上下文的关闭 if (this.sqlContext != null) this.sqlContext.clearCache if (this.ssc != null && this.ssc.getState() == StreamingContextState.ACTIVE) { this.ssc.stop(true, stopGracefully) this.ssc = null this.sc = null } if (this.sc != null && !this.sc.isStopped) { this.sc.stop() this.sc = null } } } finally { 
super.shutdown(stopGracefully) } } /** * 构建或合并SparkConf * 注:不同的子类需根据需要复写该方法 * * @param conf * 在conf基础上构建 * @return * 合并后的SparkConf对象 */ def buildConf(conf: SparkConf): SparkConf = { if (conf == null) new SparkConf().setAppName(this.appName) else conf } /** * 构建一系列context对象 */ override private[fire] final def createContext(conf: Any): Unit = { // 构建SparkConf信息 val tmpConf = if (conf == null) this.buildConf(null) else conf.asInstanceOf[SparkConf] tmpConf.setAll(PropUtils.settings) tmpConf.set("spark.driver.class.simple.name", this.driverClass) // 设置hive metastore地址 val hiveMetastoreUrl = FireHiveConf.getMetastoreUrl if (StringUtils.isBlank(hiveMetastoreUrl)) this.logger.warn("当前任务未指定hive连接信息,将不会连接hive metastore。如需使用hive,请通过spark.hive.cluster=xxx指定。") if (StringUtils.isNotBlank(hiveMetastoreUrl)) { tmpConf.set("hive.metastore.uris", hiveMetastoreUrl) // 关联所连接的hive集群,根据预制方案启用HDFS HA FireHDFSConf.hdfsHAConf.foreach(t => tmpConf.set(t._1, t._2)) } // 构建SparkSession对象 val sessionBuilder = SparkSession.builder().config(tmpConf) if (StringUtils.isNotBlank(hiveMetastoreUrl)) sessionBuilder.enableHiveSupport() // 自定义Sql解析器扩展 SqlExtensions.sqlExtension(sessionBuilder) // 在mac或windows环境下执行local模式,cpu数通过spark.local.cores指定,默认local[*] if (OSUtils.isLocal) sessionBuilder.master(s"local[${FireSparkConf.localCores}]") this._spark = sessionBuilder.getOrCreate() // 将当前spark conf中所有的配置信息同步给PropUtils PropUtils.setProperties(this._spark.conf.getAll) PropUtils.show() SparkSingletonFactory.setSparkSession(this._spark) this._spark.registerUDF() this.sc = this._spark.sparkContext this.catalog = this._spark.catalog this.sc.setLogLevel(FireSparkConf.logLevel) this.listener = new FireSparkListener(this) this.sc.addSparkListener(listener) // this.initLogging(this.className) this.hiveContext = this._spark.sqlContext this.sqlContext = this.hiveContext this.applicationId = SparkUtils.getApplicationId this.webUI = SparkUtils.getWebUI(this._spark) this._conf = tmpConf this.deployConf this.logger.info("<-- 完成Spark运行时信息初始化 -->") SparkUtils.executeHiveConfSQL(this._spark) } /** * 用于fire框架初始化,传递累加器与配置信息到executor端 */ override protected def deployConf: Unit = { if (!FireFrameworkConf.deployConf) return // 向driver和executor注册定时任务 this.taskSchedule = new SparkInternalTask(this) // driver端注册定时任务 SparkSchedulerManager.getInstance().registerTasks(this, this.taskSchedule, this.listener) // executor端与自定义累加器一同完成定时任务注册 AccumulatorManager.registerTasks(this.taskSchedule) if (isObject(this.getClass)) AccumulatorManager.registerTasks(this) // 向executor端注册自定义累加器 if (FireFrameworkConf.accEnable) this.acc.registerAccumulators(this.sc) } /** * 用于注册定时任务实例 * * @param instances * 标记有@Scheduled类的实例 */ def registerSchedule(instances: Object*): Unit = { try { // 向driver端注册定时任务 SparkSchedulerManager.getInstance().registerTasks(instances: _*) // 向executor端注册定时任务 val executors = this._conf.get("spark.executor.instances").toInt if (executors > 0 && this.sc != null) { this.sc.parallelize(1 to executors, executors).foreachPartition(i => SparkSchedulerManager.getInstance().registerTasks(instances: _*)) } } catch { case e: Throwable => this.logger.error("定时任务注册失败.", e) } } /** * 获取任务的resourceId * * @return * spark任务:driver/id flink任务:JobManager/container_xxx */ override protected def resourceId: String = { val resourceId = SparkUtils.getExecutorId if (StringUtils.isBlank(resourceId) || "driver".equals(resourceId)) "driver" else s"container_${resourceId}" } /** * SQL语法校验 * * @param sql * sql statement * @return * true:校验成功 false:校验失败 */ override def 
sqlValidate(sql: JString): Try[Unit] = SparkUtils.sqlValidate(sql) /** * SQL语法校验 * @param sql * sql statement * @return * true:校验成功 false:校验失败 */ override def sqlLegal(sql: JString): Boolean = SparkUtils.sqlLegal(sql) /** * 执行多条sql语句,以分号分割 */ private[this] def executeSql(sql: String): DataFrame = { SQLUtils.executeSql(sql) (statement => _spark.sql(statement)).get } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/BaseSparkBatch.scala ================================================ package com.zto.fire.spark /** * Spark core通用父接口 * Created by ChengLong on 2018-03-28. */ trait BaseSparkBatch extends BaseSparkCore { } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/BaseSparkCore.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.enu.JobType import com.zto.fire.common.util.PropUtils /** * Spark core通用父接口 * Created by ChengLong on 2018-03-28. */ class BaseSparkCore extends BaseSpark { override val jobType = JobType.SPARK_CORE /** * 程序初始化方法,用于初始化必要的值 * * @param conf * Spark配置信息 */ override def init(conf: Any = null, args: Array[String] = null): Unit = { super.init(conf, args) this.processAll } /** * 在加载任务配置文件前将被加载 */ override private[fire] def loadConf: Unit = { PropUtils.load(FireFrameworkConf.SPARK_CORE_CONF_FILE) } /** * Spark处理逻辑 * 注:此方法会被自动调用,不需要在main中手动调用 */ override def process: Unit = { // 子类复写该方法实现业务处理逻辑 } /** * 初始化SparkSession对象 */ override def main(args: Array[String]): Unit = { super.main(args) this.stop } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/BaseSparkStreaming.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
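Putting the BaseSparkCore contract above together (loadConf pulls in spark-core.properties, main boots the engine and finally calls stop, and process carries the user logic), a minimal batch task looks roughly like the sketch below; the object name and the SQL statement are illustrative only:

    object ShowDatabasesDemo extends BaseSparkCore {
      // invoked automatically by the fire lifecycle once the SparkSession exists
      override def process: Unit = {
        this.fire.sql("show databases").show()
      }
    }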
*/ package com.zto.fire.spark import com.zto.fire.common.anno.Rest import com.zto.fire.common.bean.rest.ResultMsg import com.zto.fire.common.conf.{FireFrameworkConf, FireKafkaConf} import com.zto.fire.common.enu.{ErrorCode, JobType, RequestMethod} import com.zto.fire.common.util.{JSONUtils, KafkaUtils, PropUtils, ReflectionUtils} import com.zto.fire.core.rest.RestCase import com.zto.fire.spark.bean.RestartParams import com.zto.fire.spark.util.{SparkSingletonFactory, SparkUtils} import org.apache.spark.SparkConf import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext, StreamingContextState} import spark.{Request, Response} import com.zto.fire._ import com.zto.fire.spark.conf.FireSparkConf /** * Spark Streaming通用父接口 * Created by ChengLong on 2018-03-28. */ trait BaseSparkStreaming extends BaseSpark { var checkPointDir: String = _ var externalConf: RestartParams = _ override val jobType = JobType.SPARK_STREAMING /** * 程序初始化方法,用于初始化必要的值 * * @param batchDuration * Streaming每个批次间隔时间 * @param isCheckPoint * 是否做checkpoint */ def init(batchDuration: Long, isCheckPoint: Boolean): Unit = { this.init(batchDuration, isCheckPoint, null) } /** * 程序初始化方法,用于初始化必要的值 * * @param batchDuration * Streaming每个批次间隔时间 * @param isCheckPoint * 是否做checkpoint */ def init(batchDuration: Long, isCheckPoint: Boolean, args: Array[String]): Unit = { this.init(batchDuration, isCheckPoint, null, args) if (FireFrameworkConf.jobAutoStart && this.ssc.getState() == StreamingContextState.INITIALIZED) this.fire.start } /** * 程序初始化方法,用于初始化必要的值 * * @param batchDuration * Streaming每个批次间隔时间 * @param isCheckPoint * 是否做checkpoint * @param conf * 传入自己构建的sparkConf对象,可以为空 */ def init(batchDuration: Long, isCheckPoint: Boolean, conf: SparkConf, args: Array[String]): Unit = { val tmpConf = buildConf(conf) if (this.sc == null) { // 添加streaming相关的restful接口,并启动 this.init(tmpConf, args) this.restfulRegister .addRest(RestCase(RequestMethod.POST.toString, "/system/streaming/hotRestart", this.hotRestart)) .startRestServer } // 判断是否为热重启,batchDuration优先级分别为 [ 代码<配置文件<热重启 ] this.batchDuration = SparkUtils.overrideBatchDuration(batchDuration, this.externalConf != null) if (!isCheckPoint) { if (this.externalConf != null && this.externalConf.isRestartSparkContext) { // 重启SparkContext对象 this.ssc = new StreamingContext(tmpConf, Seconds(Math.abs(this.batchDuration))) this.sc = this.ssc.sparkContext } else { this.ssc = new StreamingContext(this.sc, Seconds(Math.abs(this.batchDuration))) } val rememberTime = FireSparkConf.streamingRemember if (rememberTime > 0) this.ssc.remember(Milliseconds(Math.abs(rememberTime))) SparkSingletonFactory.setStreamingContext(this.ssc) this.processAll } else { this.checkPointDir = FireSparkConf.chkPointDirPrefix + this.appName this.ssc = StreamingContext.getOrCreate(this.checkPointDir, createStreamingContext _) // 初始化Streaming def createStreamingContext(): StreamingContext = { tmpConf.set("spark.streaming.receiver.writeAheadLog.enable", "true") if (this.externalConf != null && this.externalConf.isRestartSparkContext) { // 重启SparkContext对象 this.ssc = new StreamingContext(tmpConf, Seconds(Math.abs(this.batchDuration))) this.sc = this.ssc.sparkContext } else { this.ssc = new StreamingContext(this.sc, Seconds(Math.abs(this.batchDuration))) } this.ssc.checkpoint(checkPointDir) SparkSingletonFactory.setStreamingContext(this.ssc) this.processAll this.ssc } } this._conf = tmpConf } /** * 构建内部使用的SparkConf对象 */ override def buildConf(conf: SparkConf = null): SparkConf = { val tmpConf = super.buildConf(conf) // 
若重启SparkContext对象,则设置restful传递过来的新的配置信息 if (this.externalConf != null && this.externalConf.isRestartSparkContext) { if (this.externalConf.getSparkConf != null && this.externalConf.getSparkConf.size() > 0) { tmpConf.setAll(this.externalConf.getSparkConf) } } tmpConf } /** * 在加载任务配置文件前将被加载 */ override private[fire] def loadConf: Unit = { PropUtils.load(FireFrameworkConf.SPARK_STREAMING_CONF_FILE) } /** * 初始化SparkSession与StreamingContext,默认批次时间为30s * 批次时间可通过子类复写main方法实现或通过在配置文件中指定:spark.streaming.batch.duration=30 */ override def main(args: Array[String]): Unit = { val batchDuration = this.conf.getLong("spark.streaming.batch.duration", 10) val ck = this.conf.getBoolean("spark.streaming.receiver.writeAheadLog.enable", false) this.init(batchDuration, ck, args) } /** * Streaming的处理过程强烈建议放到process中,保持风格统一 * 注:此方法会被自动调用,在以下两种情况下,必须将逻辑写在process中 * 1. 开启checkpoint * 2. 支持streaming热重启(可在不关闭streaming任务的前提下修改batch时间) */ override def process: Unit = { require(this.checkPointDir == null, "当开启checkPoint机制时,必须将对接kafka的代码写在process方法内") require(this.externalConf == null, "当需要使用热重启功能时,必须将对接kafka的代码写在process方法内") } /** * kafka配置信息 * * @param groupId * 消费组 * @param offset * offset位点,smallest、largest,默认为largest * @return * kafka相关配置 */ @Deprecated def kafkaParams(groupId: String = this.appName, kafkaBrokers: String = null, offset: String = FireKafkaConf.offsetLargest, autoCommit: Boolean = false, keyNum: Int = 1): Map[String, Object] = { KafkaUtils.kafkaParams(null, groupId, kafkaBrokers, offset, autoCommit, keyNum) } /** * 用于重置StreamingContext(仅支持batch时间的修改) * * @return * 响应结果 */ @Rest("/system/streaming/hotRestart") def hotRestart(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/streaming/hotRestart") this.externalConf = JSONUtils.parseObject[RestartParams](json) new Thread(new Runnable { override def run(): Unit = { ssc.stop(externalConf.isRestartSparkContext, externalConf.isStopGracefully) init(externalConf.getBatchDuration, externalConf.isCheckPoint) } }).start() this.logger.info(s"[hotRestart] 执行热重启成功:duration=${this.externalConf.getBatchDuration} json=$json", "rest") ResultMsg.buildSuccess(s"执行热重启成功:duration=${this.externalConf.getBatchDuration}", ErrorCode.SUCCESS.toString) } catch { case e: Exception => { this.logger.error(s"[hotRestart] 执行热重启失败:json=$json", e) ResultMsg.buildError("执行热重启失败", ErrorCode.ERROR) } } } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/BaseStructuredStreaming.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
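The hot-restart endpoint above expects a POST whose body is the RestartParams JSON shown earlier. A hedged sketch of calling it with plain java.net from any client; the driver host and port are illustrative, the real port is whatever RestServerManager bound for the task:

    import java.net.{HttpURLConnection, URL}
    import java.nio.charset.StandardCharsets

    val body = """{"batchDuration":30,"restartSparkContext":false,"stopGracefully":true,"sparkConf":{"spark.streaming.concurrentJobs":"2"}}"""
    val conn = new URL("http://driver-host:8080/system/streaming/hotRestart")
      .openConnection().asInstanceOf[HttpURLConnection]
    conn.setRequestMethod("POST")
    conn.setDoOutput(true)
    conn.getOutputStream.write(body.getBytes(StandardCharsets.UTF_8))
    println(s"hot restart responded with HTTP ${conn.getResponseCode}")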
*/ package com.zto.fire.spark import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.enu.JobType import com.zto.fire.common.util.PropUtils import com.zto.fire.spark.listener.FireStreamingQueryListener /** * Structured Streaming通用父类 * Created by ChengLong on 2019-03-11. */ trait BaseStructuredStreaming extends BaseSpark { override val jobType = JobType.SPARK_STRUCTURED_STREAMING /** * 程序初始化方法,用于初始化必要的值 * * @param conf * Spark配置信息 * @param args main方法参数 */ override def init(conf: Any = null, args: Array[String] = null): Unit = { super.init(conf, args) // 添加时间监听器 this._spark.streams.addListener(new FireStreamingQueryListener) this.restfulRegister.startRestServer this.process } /** * Spark处理逻辑 * 注:此方法会被自动调用,不需要在main中手动调用 */ override def process: Unit = { // 子类复写该方法实现业务处理逻辑 } /** * 在加载任务配置文件前将被加载 */ override private[fire] def loadConf: Unit = { PropUtils.load(FireFrameworkConf.SPARK_STRUCTURED_STREAMING_CONF_FILE) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/SparkBatch.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark /** * Spark core通用父接口 * Created by ChengLong on 2018-03-28. */ trait SparkBatch extends BaseSparkCore { } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/SparkCore.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark /** * Spark core通用父接口 * Created by ChengLong on 2018-03-28. */ trait SparkCore extends BaseSparkCore { } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/SparkStreaming.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark /** * Spark Streaming通用父接口 * Created by ChengLong on 2018-03-28. */ trait SparkStreaming extends BaseSparkStreaming { } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/StructuredStreaming.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark /** * Structured Streaming通用父类 * Created by ChengLong on 2019-03-11. */ trait StructuredStreaming extends BaseStructuredStreaming { } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/acc/AccumulatorManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
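// Editor's note: a hedged sketch, not part of the source, of how these marker traits are
// typically used. Per BaseSparkStreaming above, business logic belongs in process so that
// checkpointing and hot restart can re-invoke it; the inherited main/init handle startup and
// can auto-start the StreamingContext when FireFrameworkConf.jobAutoStart is enabled.
// The object name and the body of process are illustrative assumptions:
//
//   object WordCountStreaming extends SparkStreaming {
//     override def process: Unit = {
//       // build the DStream here (e.g. from Kafka) and define the transformations;
//       // keeping this wiring inside process is required for checkpoint and hot restart
//     }
//   }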
*/ package com.zto.fire.spark.acc import com.zto.fire.predef._ import com.google.common.collect.HashBasedTable import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.conf.FireFrameworkConf.{lineageRunCount, lineageRunInitialDelay, lineageRunPeriod} import com.zto.fire.common.enu.{Datasource, ThreadPoolType} import com.zto.fire.common.util._ import com.zto.fire.spark.sync.DistributeSyncManager import com.zto.fire.spark.task.SparkSchedulerManager import com.zto.fire.spark.util.SparkUtils import org.apache.commons.lang3.StringUtils import org.apache.spark.broadcast.Broadcast import org.apache.spark.util.LongAccumulator import org.apache.spark.{SparkConf, SparkContext, SparkEnv} import java.nio.ByteBuffer import java.util.concurrent.atomic.AtomicInteger import java.util.concurrent.{ConcurrentHashMap, ConcurrentLinkedQueue, ScheduledExecutorService, TimeUnit} import scala.collection.mutable /** * fire内置Spark累加器工具类 * * @author ChengLong 2019-7-25 19:11:16 */ private[fire] object AccumulatorManager extends Logging { private lazy val executorId = SparkUtils.getExecutorId // 累加器名称,含有fire的名字将会显示在webui中 private[this] val counterLabel = "fire-counter" private[fire] val counter = new LongAccumulator // String累加器 private[this] val stringAccumulatorLabel = "stringAccumulator" private[fire] val stringAccumulator = new StringAccumulator // 血缘累加器 private[this] val lineageAccumulatorLabel = "lineageAccumulator" private[fire] val lineageAccumulator = new LineageAccumulator // 同步累加器 private[this] val syncAccumulatorLabel = "syncAccumulator" private[fire] val syncAccumulator = new SyncAccumulator // 日志累加器 private[this] val logAccumulatorLabel = "logAccumulator" private[fire] val logAccumulator = new LogAccumulator // 多值累加器 private[this] val multiCounterLabel = "fire-multiCounter" private[fire] val multiCounter = new MultiCounterAccumulator // timer累加器 private[this] val multiTimerLabel = "multiTimer" private[fire] val multiTimer = new MultiTimerAccumulator // env累加器 private[this] val envAccumulatorLabel = "envAccumulator" private[fire] val envAccumulator = new EnvironmentAccumulator // 累加器注册列表 private[this] val accMap = Map(this.lineageAccumulatorLabel -> this.lineageAccumulator, this.syncAccumulatorLabel -> this.syncAccumulator, this.stringAccumulatorLabel -> this.stringAccumulator, this.logAccumulatorLabel -> this.logAccumulator, this.counterLabel -> this.counter, this.multiCounterLabel -> this.multiCounter, this.multiTimerLabel -> this.multiTimer, this.envAccumulatorLabel -> this.envAccumulator) // 获取当前任务的全类名 private[this] lazy val jobClassName = SparkEnv.get.conf.get(FireFrameworkConf.DRIVER_CLASS_NAME, "") // 用于注册定时任务的列表 private[this] val taskRegisterSet = mutable.HashSet[Object]() // 用于广播spark配置信息 private[fire] var broadcastConf: Broadcast[SparkConf] = _ // 用于解析数据源的异步定时调度线程 private lazy val lineageThread = ThreadUtils.createThreadPool("LineageAccumulator", ThreadPoolType.SCHEDULED).asInstanceOf[ScheduledExecutorService] // 用于记录血缘解析运行的次数 private lazy val lineageRunCount = new AtomicInteger() /** * 注册定时任务实例 */ def registerTasks(tasks: Object*): Unit = { if (tasks != null) { tasks.foreach(taskInstances => taskRegisterSet.add(taskInstances)) } } /** * 将数据累加到count累加器中 * * @param value * 累加值 */ def addCounter(value: Long): Unit = { if (FireUtils.isSparkEngine) { if (SparkEnv.get != null && !"driver".equalsIgnoreCase(SparkEnv.get.executorId)) { val countAccumulator = SparkEnv.get.conf.get(this.counterLabel, "") if (StringUtils.isNotBlank(countAccumulator)) { val counter: LongAccumulator = 
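// (Editor's note, illustrative only.) On executors the accumulator instance is recovered by
// deserializing the hex-encoded bytes that registerAccumulators stored in SparkConf under
// this label; on the driver the local instance is used directly, so the add* methods work on
// both sides. A hedged usage sketch, assuming an arbitrary RDD named rdd:
//   rdd.foreachPartition { it =>
//     it.foreach(_ => AccumulatorManager.addCounter(1L))
//   }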
SparkEnv.get.closureSerializer.newInstance.deserialize(ByteBuffer.wrap(StringsUtils.toByteArray(countAccumulator))) counter.add(value) } } else { this.counter.add(value) } } } /** * 获取counter累加器的值 * * @return * 累加结果 */ def getCounter: Long = this.counter.value /** * 将timeCost累加到日志累加器中 * * @param log * TimeCost实例对象 */ def addLog(log: String): Unit = { if (isEmpty(log)) return if (FireUtils.isSparkEngine) { val env = SparkEnv.get if (env != null && !"driver".equalsIgnoreCase(SparkEnv.get.executorId)) { val logAccumulator = SparkEnv.get.conf.get(this.logAccumulatorLabel, "") if (StringUtils.isNotBlank(logAccumulator)) { val logAcc: LogAccumulator = SparkEnv.get.closureSerializer.newInstance.deserialize(ByteBuffer.wrap(StringsUtils.toByteArray(logAccumulator))) logAcc.add(log) } } else { this.logAccumulator.add(log) } } } /** * 将系统信息累加到同步累加器中 * * @param json * 通信消息 */ private[fire] def addSync(json: String): Unit = { if (isEmpty(json)) return if (FireUtils.isSparkEngine) { val env = SparkEnv.get if (env != null && !"driver".equalsIgnoreCase(SparkEnv.get.executorId)) { val syncAccumulator = SparkEnv.get.conf.get(this.syncAccumulatorLabel, "") if (StringUtils.isNotBlank(syncAccumulator)) { val syncAcc: SyncAccumulator = SparkEnv.get.closureSerializer.newInstance.deserialize(ByteBuffer.wrap(StringsUtils.toByteArray(syncAccumulator))) syncAcc.add(json) } } else { this.syncAccumulator.add(json) } } } /** * 将血缘信息添加到累加器中 */ private[fire] def addLineage(lineageMap: JConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]): Unit = { if (isEmpty(lineageMap)) return if (FireUtils.isSparkEngine) { val env = SparkEnv.get if (env != null && !"driver".equalsIgnoreCase(SparkEnv.get.executorId)) { val lineageAccumulator = SparkEnv.get.conf.get(this.lineageAccumulatorLabel, "") if (StringUtils.isNotBlank(lineageAccumulator)) { val lineageAcc: LineageAccumulator = SparkEnv.get.closureSerializer.newInstance.deserialize(ByteBuffer.wrap(StringsUtils.toByteArray(lineageAccumulator))) lineageAcc.add(lineageMap) } } else { this.lineageAccumulator.add(lineageMap) } } } /** * 将字符串等累加到String累加器中 * * @param str * 字符串(json) */ def addString(str: String): Unit = { if (isEmpty(str)) return if (FireUtils.isSparkEngine) { val env = SparkEnv.get if (env != null && !"driver".equalsIgnoreCase(SparkEnv.get.executorId)) { val stringAccumulator = SparkEnv.get.conf.get(this.stringAccumulatorLabel, "") if (StringUtils.isNotBlank(stringAccumulator)) { val logAcc: StringAccumulator = SparkEnv.get.closureSerializer.newInstance.deserialize(ByteBuffer.wrap(StringsUtils.toByteArray(stringAccumulator))) logAcc.add(str) } } else { this.stringAccumulator.add(str) } } } /** * 添加异常堆栈日志到累加器中 * * @param exceptionList * 堆栈列表 */ def addExceptionLog(exceptionList: List[(String, Throwable)], count: Long): Unit = { exceptionList.foreach(t => this.addLog(exceptionStack(t))) /** * 转换throwable为堆栈信息 */ def exceptionStack(exceptionTuple: (String, Throwable)): String = { s""" |异常信息<< ip:${OSUtils.getIp} executorId:${executorId} 异常时间:${exceptionTuple._1} 累计:${count}次. 
>> |异常堆栈:${ExceptionBus.stackTrace(exceptionTuple._2)} |""".stripMargin } } /** * 获取日志累加器中的值 * * @return * 日志累加值 */ def getLog: ConcurrentLinkedQueue[String] = this.logAccumulator.value /** * 获取字符串累加器中的值 * * @return * 日志累加值 */ def getString: ConcurrentLinkedQueue[String] = this.stringAccumulator.value /** * 获取系统同步累加器中的值 * * @return * 日志累加值 */ def getSync: ConcurrentLinkedQueue[String] = this.syncAccumulator.value /** * 获取Fire采集到的血缘信息 */ def getLineage: JConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]] = this.lineageAccumulator.value /** * 将运行时信息累加到env累加器中 * * @param envInfo * 运行时信息 */ def addEnv(envInfo: String): Unit = { if (FireUtils.isSparkEngine) { val env = SparkEnv.get if (env != null && !"driver".equalsIgnoreCase(SparkEnv.get.executorId)) { val envAccumulator = SparkEnv.get.conf.get(this.envAccumulatorLabel, "") if (StringUtils.isNotBlank(envAccumulator)) { val envAcc: EnvironmentAccumulator = SparkEnv.get.closureSerializer.newInstance.deserialize(ByteBuffer.wrap(StringsUtils.toByteArray(envAccumulator))) envAcc.add(envInfo) } } else { this.envAccumulator.add(envInfo) } } } /** * 获取env累加器中的运行时信息 * * @return * 运行时信息 */ def getEnv: ConcurrentLinkedQueue[String] = this.envAccumulator.value /** * 将数据累加到multiCount累加器中 * * @param value * 累加值 */ def addMultiCounter(key: String, value: Long): Unit = { if (FireUtils.isSparkEngine) { if (SparkEnv.get != null && !"driver".equalsIgnoreCase(SparkEnv.get.executorId)) { val countAccumulator = SparkEnv.get.conf.get(this.multiCounterLabel, "") if (StringUtils.isNotBlank(countAccumulator)) { val multiCounter: MultiCounterAccumulator = SparkEnv.get.closureSerializer.newInstance.deserialize(ByteBuffer.wrap(StringsUtils.toByteArray(countAccumulator))) multiCounter.add(key, value) } } else { this.multiCounter.add(key, value) } } } /** * 获取multiCounter累加器的值 * * @return * 累加结果 */ def getMultiCounter: ConcurrentHashMap[String, Long] = this.multiCounter.value /** * 将数据累加到timer累加器中 * * @param value * 累加值的key、value和时间的schema,默认为yyyy-MM-dd HH:mm:00 */ def addMultiTimer(key: String, value: Long, schema: String = DateFormatUtils.TRUNCATE_MIN): Unit = { if (FireUtils.isSparkEngine) { if (SparkEnv.get != null && !"driver".equalsIgnoreCase(SparkEnv.get.executorId)) { val timerAccumulator = SparkEnv.get.conf.get(this.multiTimerLabel, "") if (StringUtils.isNotBlank(timerAccumulator)) { val multiTimer: MultiTimerAccumulator = SparkEnv.get.closureSerializer.newInstance.deserialize(ByteBuffer.wrap(StringsUtils.toByteArray(timerAccumulator))) multiTimer.add(key, value, schema) } } else { this.multiTimer.add(key, value, schema) } } } /** * 用于构建复杂类型(json)的多时间维度累加器的key * 并将key作为多时间维度累加器的key * * @param value * 累加的值 * @param cluster * 连接的集群名 * @param module * 所在的模块 * @param method * 所在的方法名 * @param action * 执行的动作 * @param sink * 作用的目标 * @param level * 日志级别:INFO、ERROR * @return * 累加器的key(json格式) */ def addMultiTimer(module: String, method: String, action: String, sink: String, level: String, cluster: String, value: Long): Unit = { if (FireUtils.isSparkEngine) { val multiKey = s"""{"cluster":"$cluster","module":"$module","method":"$method","action":"$action","sink":"$sink","level":"$level","jobClass":"$jobClassName"}""" this.addMultiTimer(multiKey, value) } } /** * 获取timer累加器的值 * * @return * 累加结果 */ def getMultiTimer: HashBasedTable[String, String, Long] = this.multiTimer.value /** * 注册多个自定义累加器到每个executor * * @param sc * SparkContext * [key, accumulator] */ private[fire] def registerAccumulators(sc: SparkContext): Unit = this.synchronized { if (sc != null && accMap != 
null && accMap.nonEmpty) { // 将定时任务所在类的实例广播到每个executor端 val taskSet = sc.broadcast(taskRegisterSet) val broadcastConf = sc.broadcast(SparkEnv.get.conf) this.broadcastConf = broadcastConf // 序列化内置的累加器 val accumulatorMap = accMap.map(accInfo => { // 注册每个累加器,必须是合法的名称并且未被注册过 if (accInfo._2 != null && !accInfo._2.isRegistered) { if (StringUtils.isNotBlank(accInfo._1) && accInfo._1.contains("fire")) { sc.register(accInfo._2, accInfo._1) } else { sc.register(accInfo._2) } } (accInfo._1, SparkEnv.get.closureSerializer.newInstance().serialize(accInfo._2).array()) }) DistributeSyncManager.sync({ this.broadcastConf = broadcastConf // 将序列化后的累加器放置到conf中 accumulatorMap.foreach(accSer => SparkEnv.get.conf.set(accSer._1, StringsUtils.toHexString(accSer._2))) if (FireFrameworkConf.scheduleEnable) { // 从广播中获取到定时任务的实例,并在executor端完成注册 val tasks = taskSet.value if (tasks != null && tasks.nonEmpty && !SparkSchedulerManager.getInstance().schedulerIsStarted()) { SparkSchedulerManager.getInstance().registerTasks(tasks.toArray: _*) } } }, false) } } /** * 分布式采集血缘依赖 */ private[fire] def collectLineage: Unit = { if (!FireFrameworkConf.accEnable || !FireFrameworkConf.lineageEnable) return this.lineageThread.scheduleWithFixedDelay(new Runnable { override def run(): Unit = { if (SparkUtils.isDriver) { // driver端采集 addLineage(LineageManager.getDatasourceLineage) // executor端分布式采集 DistributeSyncManager.sync({ addLineage(LineageManager.getDatasourceLineage) }) if (lineageRunCount.incrementAndGet() > FireFrameworkConf.lineageRunCount) { logger.info(s"Spark分布式血缘解析与采集任务即将退出,总计运行:${lineageRunCount.get()}次") lineageThread.shutdown() } logger.info(s"完成Spark分布式血缘解析与采集:${lineageRunCount.get()}次") } } }, lineageRunInitialDelay + 10, lineageRunPeriod, TimeUnit.SECONDS) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/acc/EnvironmentAccumulator.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
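// Editor's note: a hedged sketch, not part of the source, of the driver-side wiring implied
// by registerAccumulators/collectLineage above. Both methods are private[fire], so this
// illustrates the framework's own startup path rather than user code:
//   // on the driver, once the SparkContext exists
//   AccumulatorManager.registerAccumulators(sc)   // register + distribute serialized accumulators
//   AccumulatorManager.collectLineage             // periodic lineage collection, self-terminating
// Each executor then finds the serialized accumulators in its SparkConf (pushed through
// DistributeSyncManager.sync) and contributes values via the add* methods above.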
*/ package com.zto.fire.spark.acc import java.util.concurrent.ConcurrentLinkedQueue import com.zto.fire.common.conf.FireFrameworkConf import org.apache.commons.lang3.StringUtils import org.apache.spark.util.AccumulatorV2 /** * 运行时累加器,用于收集运行时的jvm、gc、thread、cpu、memory、disk等信息 * * @author ChengLong 2019年11月6日 16:56:38 */ private[fire] class EnvironmentAccumulator extends AccumulatorV2[String, ConcurrentLinkedQueue[String]] { // 用于存放运行时信息的队列 private val envInfoQueue = new ConcurrentLinkedQueue[String] // 判断是否打开运行时信息累加器 private lazy val isEnable = FireFrameworkConf.accEnable && FireFrameworkConf.accEnvEnable /** * 判断累加器是否为空 */ override def isZero: Boolean = this.envInfoQueue.size() == 0 /** * 用于复制累加器 */ override def copy(): AccumulatorV2[String, ConcurrentLinkedQueue[String]] = new EnvironmentAccumulator /** * driver端执行有效,用于清空累加器 */ override def reset(): Unit = this.envInfoQueue.clear /** * executor端执行,用于收集运行时信息 * * @param envInfo * 运行时信息 */ override def add(envInfo: String): Unit = { if (this.isEnable && StringUtils.isNotBlank(envInfo)) { this.envInfoQueue.add(envInfo) this.clear } } /** * executor端向driver端merge累加数据 * * @param other * executor端累加结果 */ override def merge(other: AccumulatorV2[String, ConcurrentLinkedQueue[String]]): Unit = { if (other != null && other.value.size() > 0) { this.envInfoQueue.addAll(other.value) this.clear } } /** * driver端获取累加器的值 * * @return * 收集到的日志信息 */ override def value: ConcurrentLinkedQueue[String] = this.envInfoQueue /** * 当日志累积量超过maxLogSize所设定的值时清理过期的日志数据 * 直到达到minLogSize所设定的最小值,防止频繁的进行清理 */ def clear: Unit = { if (this.envInfoQueue.size() > FireFrameworkConf.maxEnvSize) { while (this.envInfoQueue.size() > FireFrameworkConf.minEnvSize) { this.envInfoQueue.poll } } } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/acc/LineageAccumulator.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
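// Editor's note (illustrative): the env queue above is trimmed with a high/low watermark pair.
// Once its size exceeds FireFrameworkConf.maxEnvSize, clear drains the oldest entries until
// only minEnvSize remain, so trimming is not triggered on every add. For example, with
// maxEnvSize = 1000 and minEnvSize = 800 (illustrative values, not the framework defaults),
// crossing 1000 elements removes roughly the 200 oldest records in one pass.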
*/ package com.zto.fire.spark.acc import com.zto.fire._ import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.enu.Datasource import com.zto.fire.common.util.{DatasourceDesc, LineageManager, Logging} import com.zto.fire.predef.JHashSet import org.apache.spark.util.AccumulatorV2 import java.util.concurrent.{ConcurrentHashMap, ConcurrentLinkedQueue} /** * Fire框架实时血缘累加器,用于采集实时任务用到的数据源信息、SQL血缘信息等 * 支持:SQL、JDBC、Kafka、RocketMQ、HBase等组件的血缘信息解析与采集 * * @author ChengLong 2022-08-29 09:21:48 * @since 2.3.2 */ private[fire] class LineageAccumulator extends AccumulatorV2[ConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]], ConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]] with Logging { // 用于存放字符串的队列 private val lineageMap = new ConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]() /** * 判断累加器是否为空 */ override def isZero: Boolean = this.lineageMap.isEmpty /** * 用于复制累加器 */ override def copy(): AccumulatorV2[ConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]], ConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]] = { val strAcc = new LineageAccumulator // strAcc.value.putAll(this.lineageMap) LineageManager.mergeLineageMap(strAcc.value, this.lineageMap) strAcc } /** * driver端执行有效,用于清空累加器 */ override def reset(): Unit = this.lineageMap.clear() /** * 将新的血缘信息添加到累加器中 */ override def add(v: ConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]): Unit = { if (FireFrameworkConf.accEnable && v.nonEmpty) { // this.lineageMap.putAll(v) LineageManager.mergeLineageMap(this.lineageMap, v) } } /** * executor端向driver端merge累加数据 * * @param other * executor端累加结果 */ override def merge(other: AccumulatorV2[ConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]], ConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]]): Unit = { if (other != null && other.value.size() > 0) { this.lineageMap.putAll(other.value) } } /** * driver端获取累加器的值 * * @return * 收集到的日志信息 */ override def value: ConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]] = this.lineageMap } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/acc/LogAccumulator.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.acc import com.zto.fire.common.conf.FireFrameworkConf import org.apache.spark.util.AccumulatorV2 import java.util.concurrent.ConcurrentLinkedQueue /** * fire框架日志累加器 * * @author ChengLong 2019-7-23 14:22:16 */ private[fire] class LogAccumulator extends StringAccumulator { // 判断是否打开日志累加器 override protected lazy val isEnable = FireFrameworkConf.accEnable && FireFrameworkConf.accLogEnable /** * 用于复制累加器 */ override def copy(): AccumulatorV2[String, ConcurrentLinkedQueue[String]] = new LogAccumulator } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/acc/MultiCounterAccumulator.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.acc import java.util.concurrent.ConcurrentHashMap import com.zto.fire._ import com.zto.fire.common.conf.FireFrameworkConf import org.apache.commons.lang3.StringUtils import org.apache.spark.util.AccumulatorV2 /** * 多值累加器 * * @author ChengLong 2019-8-16 16:56:06 */ private[fire] class MultiCounterAccumulator extends AccumulatorV2[(String, Long), ConcurrentHashMap[String, Long]] { private[fire] val multiCounter = new ConcurrentHashMap[String, Long]() // 判断是否打开多值累加器 private lazy val isEnable = FireFrameworkConf.accEnable && FireFrameworkConf.accMultiCounterEnable /** * 用于判断当前累加器是否为空 * * @return * true: 空 false:不为空 */ override def isZero: Boolean = this.multiCounter.size() == 0 /** * 用于复制一个新的累加器实例 * * @return * 新的累加器实例对象 */ override def copy(): AccumulatorV2[(String, Long), ConcurrentHashMap[String, Long]] = { val tmpAcc = new MultiCounterAccumulator tmpAcc.multiCounter.putAll(this.multiCounter) tmpAcc } /** * 用于重置累加器 */ override def reset(): Unit = this.multiCounter.clear /** * 用于添加新的数据到累加器中 * * @param kv * 累加值的key和value */ override def add(kv: (String, Long)): Unit = this.mergeMap(kv) /** * 用于合并数据到累加器的map中 * 存在的累加,不存在的直接添加 * * @param kv * 累加值的key和value */ private[this] def mergeMap(kv: (String, Long)): Unit = { if (this.isEnable && kv != null && StringUtils.isNotBlank(kv._1)) { this.multiCounter.put(kv._1, this.multiCounter.getOrDefault(kv._1, 0) + kv._2) } } /** * 用于合并executor端的map到driver端 * * @param other * executor端的map */ override def merge(other: AccumulatorV2[(String, Long), ConcurrentHashMap[String, Long]]): Unit = { val otherMap = other.value if (otherMap != null && otherMap.nonEmpty) { otherMap.foreach(kv => { this.mergeMap(kv) }) } } /** * 用于driver端获取累加器(map)中的值 * * @return * 累加器中的值 */ override def value: ConcurrentHashMap[String, Long] = this.multiCounter } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/acc/MultiTimerAccumulator.scala ================================================ /* * Licensed to the Apache 
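// Editor's note: a hedged usage sketch, not part of the source. The multi-value counter is
// keyed by an arbitrary string and summed per key; the key name and values are assumptions:
//   AccumulatorManager.addMultiCounter("hbase.put.records", 500L)
//   AccumulatorManager.addMultiCounter("hbase.put.records", 300L)
//   // after executor results are merged, the driver observes "hbase.put.records" -> 800
//   val counters = AccumulatorManager.getMultiCounter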
Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.acc import com.google.common.collect.HashBasedTable import com.zto.fire._ import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.util.DateFormatUtils import org.apache.commons.lang3.StringUtils import org.apache.spark.util.AccumulatorV2 import java.util.Date import scala.collection.mutable /** * timer累加器,对相同的key进行分钟级维度累加 * * @author ChengLong 2019-8-21 14:22:12 */ private[fire] class MultiTimerAccumulator extends AccumulatorV2[(String, Long, String), HashBasedTable[String, String, Long]] { private[fire] lazy val timerCountTable = HashBasedTable.create[String, String, Long] // 用于记录上次清理过期累加数据的时间 private var lastClearTime = new Date // 判断是否打开多时间维度累加器 private lazy val isEnable = FireFrameworkConf.accEnable && FireFrameworkConf.accMultiCounterEnable /** * 用于判断当前累加器是否为空 * * @return * true: 空 false:不为空 */ override def isZero: Boolean = this.timerCountTable.size() == 0 /** * 用于复制一个新的累加器实例 * * @return * 新的累加器实例对象 */ override def copy(): AccumulatorV2[(String, Long, String), HashBasedTable[String, String, Long]] = { val tmpAcc = new MultiTimerAccumulator tmpAcc.timerCountTable.putAll(this.timerCountTable) tmpAcc } /** * 用于重置累加器 */ override def reset(): Unit = this.timerCountTable.clear /** * 用于添加新的数据到累加器中 * * @param kv * 累加值的key、value和时间的schema,默认为yyyy-MM-dd HH:mm:00 */ override def add(kv: (String, Long, String)): Unit = { if (!isEnable || kv == null) return val schema = if (StringUtils.isBlank(kv._3)) { DateFormatUtils.TRUNCATE_MIN } else kv._3 if (StringUtils.isNotBlank(kv._1)) { this.mergeTable(kv._1, DateFormatUtils.formatCurrentBySchema(schema), kv._2) } } /** * 用于合并数据到累加器的map中 * 存在的累加,不存在的直接添加 * * @param kv * 累加值的key和value */ private[this] def mergeTable(kv: (String, String, Long)): Unit = { if (kv != null && StringUtils.isNotBlank(kv._1) && kv._2 != null) { val value = if (this.timerCountTable.contains(kv._1, kv._2)) this.timerCountTable.get(kv._1, kv._2) else 0L this.timerCountTable.put(kv._1, kv._2, kv._3 + value) this.clear } } /** * 用于合并executor端的map到driver端 * * @param other * executor端的map */ override def merge(other: AccumulatorV2[(String, Long, String), HashBasedTable[String, String, Long]]): Unit = { val otherTable = other.value if (otherTable != null && !otherTable.isEmpty) { otherTable.cellSet().foreach(timer => { this.mergeTable(timer.getRowKey, timer.getColumnKey, timer.getValue) }) } } /** * 用于driver端获取累加器(map)中的值 * * @return * 累加器中的值 */ override def value: HashBasedTable[String, String, Long] = this.timerCountTable /** * 当累积量超过maxTimerSize所设定的值时清理过期的数据 */ private[this] def clear: Unit = { val currentDate = new Date if (this.timerCountTable.size() >= FireFrameworkConf.maxTimerSize && DateFormatUtils.betweenHours(currentDate, lastClearTime) >= FireFrameworkConf.maxTimerHour) { val criticalTime 
= DateFormatUtils.addHours(currentDate, -Math.abs(FireFrameworkConf.maxTimerHour)) val timeOutSet = new mutable.HashSet[String]() this.timerCountTable.rowMap().foreach(kmap => { kmap._2.filter(_ != null).foreach(kv => { if (kv._1.compareTo(criticalTime) <= 0 && StringUtils.isNotBlank(kmap._1) && StringUtils.isNotBlank(kv._1)) { timeOutSet += kmap._1 + "#" + kv._1 } }) }) timeOutSet.filter(StringUtils.isNotBlank).map(t => (t.split("#"))).foreach(kv => { this.timerCountTable.remove(kv(0), kv(1)) }) this.lastClearTime = currentDate } } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/acc/StringAccumulator.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.acc import com.zto.fire._ import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.util.Logging import org.apache.spark.util.AccumulatorV2 import java.util.concurrent.ConcurrentLinkedQueue /** * Fire框架String类型累加器 * * @author ChengLong 2022-08-24 14:44:55 * @since 2.3.2 */ private[fire] class StringAccumulator extends AccumulatorV2[String, ConcurrentLinkedQueue[String]] with Logging { // 用于存放字符串的队列 private val queue = new ConcurrentLinkedQueue[String] // 判断是否打开日志累加器 protected lazy val isEnable = FireFrameworkConf.accEnable /** * 判断累加器是否为空 */ override def isZero: Boolean = this.queue.size() == 0 /** * 用于复制累加器 */ override def copy(): AccumulatorV2[String, ConcurrentLinkedQueue[String]] = { val strAcc = new StringAccumulator strAcc.value.addAll(this.queue) strAcc } /** * driver端执行有效,用于清空累加器 */ override def reset(): Unit = this.queue.clear /** * executor端执行,用于收集日志信息 * * @param log * 日志信息 */ override def add(str: String): Unit = { if (this.isEnable && noEmpty(str)) { this.queue.add(str) this.clear } } /** * executor端向driver端merge累加数据 * * @param other * executor端累加结果 */ override def merge(other: AccumulatorV2[String, ConcurrentLinkedQueue[String]]): Unit = { if (other != null && other.value.size() > 0) { this.queue.addAll(other.value) other.value.foreach(t => this.queue.add(t)) this.clear } } /** * driver端获取累加器的值 * * @return * 收集到的日志信息 */ override def value: ConcurrentLinkedQueue[String] = this.queue /** * 当日志累积量超过maxLogSize所设定的值时清理过期的日志数据 * 直到达到minLogSize所设定的最小值,防止频繁的进行清理 */ def clear: Unit = { if (this.queue.size() > FireFrameworkConf.maxLogSize) { while (this.queue.size() > FireFrameworkConf.minLogSize) { this.queue.poll } } } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/acc/SyncAccumulator.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.acc import org.apache.spark.util.AccumulatorV2 import java.util.concurrent.ConcurrentLinkedQueue /** * fire框架内部信息同步累加器 * * @author 2022-08-24 14:49:55 * @since 2.3.2 */ private[fire] class SyncAccumulator extends StringAccumulator { /** * 用于复制累加器 */ override def copy(): AccumulatorV2[String, ConcurrentLinkedQueue[String]] = new SyncAccumulator } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/conf/FireSparkConf.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.conf import com.zto.fire.common.util.PropUtils /** * Spark引擎相关配置 * * @author ChengLong * @since 1.1.0 * @create 2020-07-13 14:57 */ private[fire] object FireSparkConf { lazy val SPARK_APP_NAME = "spark.appName" lazy val SPARK_LOCAL_CORES = "spark.local.cores" lazy val SPARK_LOG_LEVEL = "spark.log.level" lazy val SPARK_SAVE_MODE = "spark.saveMode" lazy val SPARK_PARALLELISM = "spark.parallelism" lazy val SPARK_CHK_POINT_DIR = "spark.chkpoint.dir" lazy val SPARK_SQL_EXTENSIONS_ENABLE = "spark.fire.sql.extensions.enable" // spark datasource v2 api中的options配置key前缀 lazy val SPARK_DATASOURCE_OPTIONS_PREFIX = "spark.datasource.options." 
lazy val SPARK_DATASOURCE_FORMAT = "spark.datasource.format" lazy val SPARK_DATSOURCE_SAVE_MODE = "spark.datasource.saveMode" // 用于dataFrame.write.format.save()参数 lazy val SPARK_DATASOURCE_SAVE_PARAM = "spark.datasource.saveParam" lazy val SPARK_DATASOURCE_IS_SAVE_TABLE = "spark.datasource.isSaveTable" // 用于spark.read.format.load()参数 lazy val SPARK_DATASOURCE_LOAD_PARAM = "spark.datasource.loadParam" // spark 默认的checkpoint地址 lazy val sparkChkPointDir = "hdfs://nameservice1/user/spark/ckpoint/" // spark streaming批次时间 lazy val SPARK_STREAMING_BATCH_DURATION = "spark.streaming.batch.duration" // spark streaming的remember时间,-1表示不生效(ms) lazy val SPARK_STREAMING_REMEMBER = "spark.streaming.remember" // 当stage失败多少个时退出整个SparkSession lazy val SPARK_FIRE_STAGE_MAXFAILURES = "spark.fire.stage.maxFailures" // spark streaming的remember时间,-1表示不生效(ms) def streamingRemember: Long = PropUtils.getLong(this.SPARK_STREAMING_REMEMBER, -1) lazy val appName = PropUtils.getString(this.SPARK_APP_NAME, "") lazy val localCores = PropUtils.getString(this.SPARK_LOCAL_CORES, "*") lazy val logLevel = PropUtils.getString(this.SPARK_LOG_LEVEL, "info").toUpperCase lazy val saveMode = PropUtils.getString(this.SPARK_SAVE_MODE, "Append") lazy val parallelism = PropUtils.getInt(this.SPARK_PARALLELISM, 200) lazy val chkPointDirPrefix = PropUtils.getString(this.SPARK_CHK_POINT_DIR, this.sparkChkPointDir) lazy val confBathDuration = PropUtils.getInt(this.SPARK_STREAMING_BATCH_DURATION, -1) // 是否启用spark sql解析器扩展 lazy val sqlExtensionsEnable = PropUtils.getBoolean(this.SPARK_SQL_EXTENSIONS_ENABLE, true) /** * spark datasource api中的format参数 */ def datasourceFormat(keyNum: Int = 1): String = PropUtils.getString(this.SPARK_DATASOURCE_FORMAT, "", keyNum) /** * spark datasource api中的saveMode参数 */ def datasourceSaveMode(keyNum: Int = 1): String = PropUtils.getString(this.SPARK_DATSOURCE_SAVE_MODE, "Append", keyNum) /** * spark datasource api中的save方法参数 */ def datasourceSaveParam(keyNum: Int = 1): String = PropUtils.getString(this.SPARK_DATASOURCE_SAVE_PARAM, "", keyNum) /** * spark datasource api中的isSaveTable方法 */ def datasourceIsSaveTable(keyNum: Int = 1): String = PropUtils.getString(this.SPARK_DATASOURCE_IS_SAVE_TABLE, "", keyNum) /** * spark datasource api中的load方法参数 */ def datasourceLoadParam(keyNum: Int = 1): String = PropUtils.getString(this.SPARK_DATASOURCE_LOAD_PARAM, "", keyNum) /** * 当stage失败次数大于该值时SparkSession退出 */ def stageMaxFailures: Int = PropUtils.getInt(this.SPARK_FIRE_STAGE_MAXFAILURES, -1) } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/conf/SparkAnnoManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
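// Editor's note: a hedged configuration sketch, not part of the source. The keys defined in
// FireSparkConf above are resolved through PropUtils from the task's properties; the values
// below are purely illustrative:
//   spark.appName=OrderEtlStreaming
//   spark.log.level=warn
//   spark.streaming.batch.duration=30
//   spark.streaming.remember=120000
//   spark.chkpoint.dir=hdfs://nameservice1/user/spark/ckpoint/
//   spark.datasource.format=hudi
//   spark.datasource.saveMode=Append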
*/ package com.zto.fire.spark.conf import com.zto.fire.common.conf.FireFrameworkConf.FIRE_JOB_AUTO_START import com.zto.fire.common.util.PropUtils import com.zto.fire.core.conf.AnnoManager import com.zto.fire.spark.anno.{SparkConf, Streaming, StreamingDuration} /** * 注解管理器,用于将主键中的配置信息映射为键值对信息 * * @author ChengLong 2022-04-26 11:19:00 * @since 2.2.2 */ private[fire] class SparkAnnoManager extends AnnoManager { /** * 将@StreamingDuration中配置的信息映射为键值对形式 * @param StreamingDuration * StreamingDuration注解实例 */ def mapStreamingDuration(streaming: StreamingDuration): Unit = { this.put(FireSparkConf.SPARK_STREAMING_BATCH_DURATION, streaming.value()) this.put(FireSparkConf.SPARK_STREAMING_BATCH_DURATION, streaming.interval()) this.put("spark.streaming.receiver.writeAheadLog.enable", streaming.checkpoint()) } /** * 将@Streaming中配置的信息映射为键值对形式 * @param Streaming * Streaming注解实例 */ def mapStreaming(streaming: Streaming): Unit = { this.put(FireSparkConf.SPARK_STREAMING_BATCH_DURATION, streaming.value()) this.put(FireSparkConf.SPARK_STREAMING_BATCH_DURATION, streaming.interval()) this.put("spark.streaming.receiver.writeAheadLog.enable", streaming.checkpoint()) this.put("spark.streaming.backpressure.enabled", streaming.backpressure()) this.put("spark.streaming.concurrentJobs", streaming.concurrent()) this.put("spark.streaming.stopGracefullyOnShutdown", streaming.stopGracefullyOnShutdown()) this.put("spark.streaming.kafka.maxRatePerPartition", streaming.maxRatePerPartition()) this.put("spark.streaming.backpressure.initialRate", streaming.backpressureInitialRate()) this.put("spark.rocket.pull.max.speed.per.partition", streaming.maxRatePerPartition()) this.put(FIRE_JOB_AUTO_START, streaming.autoStart()) } /** * 将@SparkConf中配置的信息映射为键值对形式 */ def mapSparkConf(sparkConf: SparkConf): Unit = { val valueConf = PropUtils.parseTextConfig(sparkConf.value()) valueConf.foreach(kv => PropUtils.setNormalProperty(kv._1, kv._2)) sparkConf.props().foreach(prop => { val conf = prop.split("=", 2) if (conf != null && conf.length == 2) { PropUtils.setNormalProperty(conf(0), conf(1)) } }) } /** * 用于注册需要映射配置信息的自定义主键 */ override protected[fire] def register: Unit = { AnnoManager.registerAnnoSet.add(classOf[SparkConf]) AnnoManager.registerAnnoSet.add(classOf[StreamingDuration]) AnnoManager.registerAnnoSet.add(classOf[Streaming]) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/connector/BeanGenReceiver.scala ================================================ package com.zto.fire.spark.connector import com.zto.fire._ import com.zto.fire.common.util.{Logging, ReflectionUtils, ThreadUtils} import com.zto.fire.spark.bean.GenerateBean import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.receiver.Receiver import scala.reflect.{ClassTag, classTag} /** * JavaBean动态生成Receiver,用于随机生成JavaBean实例 * JavaBean必须是GenerateBean的子类,并且必须实现generate方法 * * @param delay 数据生成的间隔时间(ms) * @author ChengLong 2022-03-07 14:30:55 * @since 2.2.1 */ class BeanGenReceiver[T <: GenerateBean[T] : ClassTag](delay: Long = 1000) extends Receiver[T](StorageLevel.MEMORY_AND_DISK_SER) with Logging { private lazy val generate = "generate" /** * 生命周期方法,开始receive */ override def onStart(): Unit = { this.logInfo("开始启动BeanGenReceiver") ThreadUtils.runAsSingle(receive()) } /** * 接受T类型自动生成的JavaBean对象实例 */ def receive(): Unit = { val clazz = classTag[T].runtimeClass ReflectionUtils.getMethodByName(clazz, generate) val method = clazz.getDeclaredMethod(generate) val emptyInstance = 
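// (Editor's note, illustrative only.) The receiver here creates the bean instance reflectively,
// so T needs a public no-arg constructor, and its generate() method is expected to return a
// java List of freshly generated beans (see the invoke/JList cast below). A hedged sketch of
// such a bean; the class name and field are assumptions:
//   class OrderBean extends GenerateBean[OrderBean] {
//     var orderId: String = _
//     def generate(): JList[OrderBean] = { /* build and return a batch of random beans */ }
//   }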
clazz.newInstance() while (true) { this.store(method.invoke(emptyInstance).asInstanceOf[JList[T]].toIterator) Thread.sleep(delay) } } override def onStop(): Unit = { this.logWarning("停止BeanGenReceiver.") } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/connector/DataGenReceiver.scala ================================================ package com.zto.fire.spark.connector import com.zto.fire.common.util.{Logging, ThreadUtils} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.receiver.Receiver import scala.collection.mutable /** * 数据动态生成Receiver,数据的生成逻辑使用generateFun函数来定义 * * @param delay 数据生成的间隔时间(ms) * @param generateFun 数据生成的函数 * @author ChengLong 2022-03-07 14:10:55 * @since 2.2.1 */ class DataGenReceiver[T](delay: Long = 1000, generateFun: => mutable.Buffer[T]) extends Receiver[T](StorageLevel.MEMORY_AND_DISK_SER) with Logging { /** * 生命周期方法,开始receive */ override def onStart(): Unit = { this.logInfo("开始启动DataGenReceiver.") ThreadUtils.runAsSingle(receive()) } /** * 接受T类型自动生成的JavaBean对象实例 */ def receive(): Unit = { while (true) { this.store(this.generateFun.toIterator) Thread.sleep(delay) } } override def onStop(): Unit = { this.logWarning("停止DataGenReceiver.") } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/connector/HBaseBulkConnector.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
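// Editor's note: a hedged usage sketch, not part of the source. Both generators plug into a
// standard receiver stream; the StreamingContext reference and the generated data are
// assumptions:
//   val stream = ssc.receiverStream(
//     new DataGenReceiver[String](1000, scala.collection.mutable.Buffer("a", "b", "c")))
//   stream.print()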
*/ package com.zto.fire.spark.connector import com.zto.fire.common.anno.Internal import com.zto.fire.core.connector.{Connector, ConnectorFactory} import com.zto.fire.hbase.HBaseConnector import com.zto.fire.hbase.bean.{HBaseBaseBean, MultiVersionsBean} import com.zto.fire.hbase.conf.FireHBaseConf import com.zto.fire.predef._ import com.zto.fire.spark.conf.FireSparkConf import com.zto.fire.spark.util.{SparkSingletonFactory, SparkUtils} import org.apache.commons.lang3.StringUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hbase.TableName import org.apache.hadoop.hbase.client._ import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.mapreduce.TableOutputFormat import org.apache.hadoop.hbase.spark.HBaseContext import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.mapreduce.Job import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, Encoders, Row} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.DStream import scala.collection.mutable.ListBuffer import scala.reflect.ClassTag /** * HBase直连工具类,基于HBase-Spark API开发 * 具有更强大的性能和更低的资源开销,适用于 * 与Spark相结合的大数据量操作,优点体现在并行 * 和大数据量。如果数据量不大,仍推荐使用 * HBaseConnector进行相关的操作 * * @param sc * SparkContext实例 * @param config * HBase相关配置参数 * @author ChengLong 2018年4月10日 10:39:28 */ class HBaseBulkConnector(@scala.transient sc: SparkContext, @scala.transient config: Configuration, batchSize: Int = 10000, keyNum: Int = 1) extends HBaseContext(sc, config) with Connector { private[fire] lazy val finalBatchSize = if (FireHBaseConf.hbaseBatchSize(this.keyNum) != -1) FireHBaseConf.hbaseBatchSize(this.keyNum) else this.batchSize private[this] lazy val sparkSession = SparkSingletonFactory.getSparkSession @transient private[this] lazy val tableConfMap = new JConcurrentHashMap[String, Configuration]() /** * 根据RDD[String]批量删除,rdd是rowkey的集合 * 类型为String * * @param rdd * 类型为String的RDD数据集 * @param tableName * HBase表名 */ def bulkDeleteRDD(tableName: String, rdd: RDD[String]): Unit = { requireNonEmpty(tableName, rdd) tryWithLog { val rowKeyRDD = rdd.filter(rowkey => StringUtils.isNotBlank(rowkey)).map(rowKey => Bytes.toBytes(rowKey)) this.bulkDelete[Array[Byte]](rowKeyRDD, TableName.valueOf(tableName), rec => new Delete(rec), this.finalBatchSize) }(this.logger, s"execute bulkDeleteRDD(tableName: ${tableName}, batchSize: ${batchSize}) success. keyNum: ${keyNum}") } /** * 根据Dataset[String]批量删除,Dataset是rowkey的集合 * 类型为String * * @param dataset * 类型为String的Dataset集合 * @param tableName * HBase表名 */ def bulkDeleteDS(tableName: String, dataset: Dataset[String]): Unit = { requireNonEmpty(tableName, dataset) tryWithLog { this.bulkDeleteRDD(tableName, dataset.rdd) }(this.logger, s"execute bulkDeleteDS(tableName: ${tableName}, batchSize: ${finalBatchSize}) success. keyNum: ${keyNum}") } /** * 指定rowkey集合,进行批量删除操作内部会将这个集合转为RDD * 推荐在较大量数据时使用,小数据量的删除操作仍推荐使用HBaseConnector * * @param tableName * HBase表名 * @param seq * 待删除的rowKey集合 */ def bulkDeleteList(tableName: String, seq: Seq[String]): Unit = { requireNonEmpty(tableName, seq) tryWithLog { val rdd = sc.parallelize(seq, math.max(1, math.min(seq.length / 2, FireSparkConf.parallelism))) this.bulkDeleteRDD(tableName, rdd) }(this.logger, s"execute bulkDeleteList(tableName: ${tableName}) success. 
keyNum: ${keyNum}") } /** * 根据rowKey集合批量获取数据,并映射为自定义的JavaBean类型 * * @param tableName * HBase表名 * @param rdd * rowKey集合,类型为RDD[String] * @param clazz * 获取后的记录转换为目标类型(自定义的JavaBean类型) * @tparam E * 自定义JavaBean类型,必须继承自HBaseBaseBean * @return * 自定义JavaBean的对象结果集 */ def bulkGetRDD[E <: HBaseBaseBean[E] : ClassTag](tableName: String, rdd: RDD[String], clazz: Class[E]): RDD[E] = { requireNonEmpty(tableName, rdd, clazz) tryWithReturn { val rowKeyRDD = rdd.filter(StringUtils.isNotBlank(_)).map(rowKey => Bytes.toBytes(rowKey)) val getRDD = this.bulkGet[Array[Byte], E](TableName.valueOf(tableName), batchSize, rowKeyRDD, rowKey => new Get(rowKey), (result: Result) => { HBaseConnector(keyNum = this.keyNum).hbaseRow2Bean(result, clazz).getOrElse(clazz.newInstance()) }).filter(bean => noEmpty(bean, bean.rowKey)).persist(StorageLevel.fromString(FireHBaseConf.hbaseStorageLevel(this.keyNum))) getRDD }(this.logger, s"execute bulkGetRDD(tableName: ${tableName}, batchSize: ${finalBatchSize}) success. keyNum: ${keyNum}") } /** * 根据rowKey集合批量获取数据,并映射为自定义的JavaBean类型 * * @param tableName * HBase表名 * @param rdd * rowKey集合,类型为RDD[String] * @param clazz * 获取后的记录转换为目标类型(自定义的JavaBean类型) * @tparam E * 自定义JavaBean类型,必须继承自HBaseBaseBean * @return * 自定义JavaBean的对象结果集 */ def bulkGetDF[E <: HBaseBaseBean[E] : ClassTag](tableName: String, rdd: RDD[String], clazz: Class[E]): DataFrame = { requireNonEmpty(tableName, rdd, clazz) tryWithReturn { val resultRdd = this.bulkGetRDD[E](tableName, rdd, clazz) this.sparkSession.createDataFrame(resultRdd, clazz) }(this.logger, s"execute bulkGetDF(tableName: ${tableName}, batchSize: ${finalBatchSize}) success. keyNum: ${keyNum}") } /** * 根据rowKey集合批量获取数据,并映射为自定义的JavaBean类型 * * @param tableName * HBase表名 * @param rdd * rowKey集合,类型为RDD[String] * @param clazz * 获取后的记录转换为目标类型(自定义的JavaBean类型) * @tparam E * 自定义JavaBean类型,必须继承自HBaseBaseBean * @return * 自定义JavaBean的对象结果集 */ def bulkGetDS[E <: HBaseBaseBean[E] : ClassTag](tableName: String, rdd: RDD[String], clazz: Class[E]): Dataset[E] = { requireNonEmpty(tableName, rdd, clazz) tryWithReturn { val resultRdd = this.bulkGetRDD[E](tableName, rdd, clazz) this.sparkSession.createDataset(resultRdd)(Encoders.bean(clazz)) }(this.logger, s"execute bulkGetDS(tableName: ${tableName}, batchSize: ${finalBatchSize}) success. keyNum: ${keyNum}") } /** * 根据rowKey集合批量获取数据,并映射为自定义的JavaBean类型 * 内部实现是将rowkey集合转为RDD[String],推荐在数据量较大 * 时使用。数据量较小请优先使用HBaseConnector * * @param tableName * HBase表名 * @param clazz * 具体类型 * @param seq * rowKey集合 * @tparam E * 自定义JavaBean类型,必须继承自HBaseBaseBean * @return * 自定义JavaBean的对象结果集 */ def bulkGetSeq[E <: HBaseBaseBean[E] : ClassTag](tableName: String, seq: Seq[String], clazz: Class[E]): RDD[E] = { requireNonEmpty(tableName, seq, clazz) tryWithReturn { val rdd = sc.parallelize(seq, math.max(1, math.min(seq.length / 2, FireSparkConf.parallelism))) this.bulkGetRDD(tableName, rdd, clazz) }(this.logger, s"execute bulkGetSeq(tableName: ${tableName}, batchSize: ${finalBatchSize}) success. 
keyNum: ${keyNum}") } /** * 批量写入,将自定义的JavaBean数据集批量并行写入 * 到HBase的指定表中。内部会将自定义JavaBean的相应 * 字段一一映射为Put对象,并完成一次写入 * * @param tableName * HBase表名 * @param rdd * 数据集合,数类型需继承自HBaseBaseBean * @tparam T * 数据类型为HBaseBaseBean的子类 */ def bulkPutRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, rdd: RDD[T]): Unit = { requireNonEmpty(tableName, rdd) tryWithLog { this.bulkPut[T](rdd, TableName.valueOf(tableName), (putRecord: T) => { HBaseConnector(keyNum = this.keyNum).convert2Put[T](if (HBaseConnector(keyNum = this.keyNum).getMultiVersion[T]) new MultiVersionsBean(putRecord).asInstanceOf[T] else putRecord, HBaseConnector(keyNum = this.keyNum).getNullable[T]) }) }(this.logger, s"execute bulkPutRDD(tableName: ${tableName}) success. keyNum: ${keyNum}") } /** * 批量写入,将自定义的JavaBean数据集批量并行写入 * 到HBase的指定表中。内部会将自定义JavaBean的相应 * 字段一一映射为Put对象,并完成一次写入。如果数据量 * 较大,推荐使用。数据量过小则推荐使用HBaseConnector * * @param tableName * HBase表名 * @param seq * 数据集,类型为HBaseBaseBean的子类 * @tparam T * 对象类型必须是HBaseBaseBean的子类 */ def bulkPutSeq[T <: HBaseBaseBean[T] : ClassTag](tableName: String, seq: Seq[T]): Unit = { requireNonEmpty(tableName, seq) tryWithLog { val rdd = this.sc.parallelize(seq, math.max(1, math.min(seq.length / 2, FireSparkConf.parallelism))) this.bulkPutRDD(tableName, rdd) }(this.logger, s"execute bulkPutRDD(tableName: ${tableName}) success. keyNum: ${keyNum}") } /** * 定制化scan设置后从指定的表中scan数据 * 并将scan到的结果集映射为自定义JavaBean对象 * * @param tableName * HBase表名 * @param scan * scan对象 * @param clazz * 自定义JavaBean的Class对象 * @tparam T * 对象类型必须是HBaseBaseBean的子类 * @return * scan获取到的结果集,类型为RDD[T] */ def bulkScanRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan)(implicit canOverload: Boolean = true): RDD[T] = { requireNonEmpty(tableName, scan, clazz) tryWithReturn { if (scan.getCaching == -1) { scan.setCaching(this.finalBatchSize) } this.hbaseRDD(TableName.valueOf(tableName), scan).mapPartitions(it => HBaseConnector(keyNum = this.keyNum).hbaseRow2BeanList(it, clazz)).persist(StorageLevel.fromString(FireHBaseConf.hbaseStorageLevel(this.keyNum))) }(this.logger, s"execute bulkScanRDD(tableName: ${tableName}) success. 
keyNum: ${keyNum}") } /** * 指定startRow和stopRow后自动创建scan对象完成数据扫描 * 并将scan到的结果集映射为自定义JavaBean对象 * * @param tableName * HBase表名 * @param startRow * rowkey的起始 * @param stopRow * rowkey的结束 * @param clazz * 自定义JavaBean的Class对象 * @tparam T * 对象类型必须是HBaseBaseBean的子类 * @return * scan获取到的结果集,类型为RDD[T] */ def bulkScanRDD2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String): RDD[T] = { requireNonEmpty(tableName, clazz, startRow, stopRow) this.bulkScanRDD(tableName, clazz, HBaseConnector.buildScan(startRow, stopRow)) } /** * 批量写入,将自定义的JavaBean数据集批量并行写入 * 到HBase的指定表中。内部会将自定义JavaBean的相应 * 字段一一映射为Put对象,并完成一次写入 * * @param tableName * HBase表名 * @param dataFrame * dataFrame实例,数类型需继承自HBaseBaseBean * @tparam T * 数据类型为HBaseBaseBean的子类 */ def bulkPutDF[T <: HBaseBaseBean[T] : ClassTag](tableName: String, dataFrame: DataFrame, clazz: Class[T]): Unit = { requireNonEmpty(tableName, dataFrame, clazz) val rdd = dataFrame.rdd.mapPartitions(it => SparkUtils.sparkRowToBean(it, clazz)) this.bulkPutRDD[T](tableName, rdd) } /** * 批量写入,将自定义的JavaBean数据集批量并行写入 * 到HBase的指定表中。内部会将自定义JavaBean的相应 * 字段一一映射为Put对象,并完成一次写入 * * @param tableName * HBase表名 * @param dataset * dataFrame实例,数类型需继承自HBaseBaseBean * @tparam T * 数据类型为HBaseBaseBean的子类 */ def bulkPutDS[T <: HBaseBaseBean[T] : ClassTag](tableName: String, dataset: Dataset[T]): Unit = { requireNonEmpty(tableName, dataset) this.bulkPutRDD[T](tableName, dataset.rdd) } /** * 用于已经映射为指定类型的DStream实时 * 批量写入至HBase表中 * * @param tableName * HBase表名 * @param dstream * 类型为自定义JavaBean的DStream流 * @tparam T * 对象类型必须是HBaseBaseBean的子类 */ def bulkPutStream[T <: HBaseBaseBean[T] : ClassTag](tableName: String, dstream: DStream[T]): Unit = { requireNonEmpty(tableName, dstream) tryWithLog { this.streamBulkPut[T](dstream, TableName.valueOf(tableName), (putRecord: T) => { HBaseConnector(keyNum = this.keyNum).convert2Put[T](if (HBaseConnector(keyNum = this.keyNum).getMultiVersion[T]) new MultiVersionsBean(putRecord).asInstanceOf[T] else putRecord, HBaseConnector(keyNum = this.keyNum).getNullable[T]) }) }(this.logger, s"execute bulkPutStream(tableName: ${tableName}) success. keyNum: ${keyNum}") } /** * 以spark 方式批量将rdd数据写入到hbase中 * * @param rdd * 类型为HBaseBaseBean子类的rdd * @param tableName * hbase表名 * @tparam T * 数据类型 */ def hadoopPut[T <: HBaseBaseBean[T] : ClassTag](tableName: String, rdd: RDD[T]): Unit = { requireNonEmpty(tableName, rdd) tryWithLog { rdd.mapPartitions(it => { val putList = ListBuffer[(ImmutableBytesWritable, Put)]() it.foreach(t => { putList += Tuple2(new ImmutableBytesWritable(), HBaseConnector(keyNum = this.keyNum).convert2Put[T](t, HBaseConnector(keyNum = this.keyNum).getNullable[T])) }) putList.iterator }).saveAsNewAPIHadoopDataset(this.getConfiguration(tableName)) }(this.logger, s"execute hadoopPut(tableName: ${tableName}) success. 
keyNum: ${keyNum}") } /** * 使用spark API的方式将DataFrame中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 * @param clazz * JavaBean类型,为HBaseBaseBean的子类 */ def hadoopPutDF[E <: HBaseBaseBean[E] : ClassTag](tableName: String, dataFrame: DataFrame, clazz: Class[E]): Unit = { requireNonEmpty(tableName, dataFrame, clazz) val rdd = dataFrame.rdd.mapPartitions(it => SparkUtils.sparkRowToBean(it, clazz)) this.hadoopPut[E](tableName, rdd) } /** * 使用spark API的方式将DataFrame中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 * @param dataset * JavaBean类型,待插入到hbase的数据集 */ def hadoopPutDS[E <: HBaseBaseBean[E] : ClassTag](tableName: String, dataset: Dataset[E]): Unit = { requireNonEmpty(tableName, dataset)("参数不合法:dataset不能为空") this.hadoopPut[E](tableName, dataset.rdd) } /** * 以spark 方式批量将DataFrame数据写入到hbase中 * 注:此方法与hbaseHadoopPutDF不同之处在于,它不强制要求该DataFrame一定要与HBaseBaseBean的子类对应 * 但需要指定rowKey的构建规则,相对与hbaseHadoopPutDF来说,少了中间的两次转换,性能会更高 * * @param df * spark的DataFrame * @param tableName * hbase表名 * @tparam T * JavaBean类型 */ def hadoopPutDFRow[T <: HBaseBaseBean[T] : ClassTag](tableName: String, df: DataFrame, buildRowKey: (Row) => String): Unit = { requireNonEmpty(tableName, df) val insertEmpty = HBaseConnector(keyNum = this.keyNum).getNullable[T] tryWithLog { val fields = df.schema.fields df.rdd.mapPartitions(it => { var count = 0 val putList = ListBuffer[(ImmutableBytesWritable, Put)]() it.foreach(row => { val put = new Put(Bytes.toBytes(buildRowKey(row))) fields.foreach(field => { val fieldName = field.name val fieldIndex = row.fieldIndex(fieldName) val dataType = field.dataType.getClass.getSimpleName var fieldValue: Any = null if (!row.isNullAt(fieldIndex)) { fieldValue = row.get(fieldIndex) if (dataType.contains("StringType")) { put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(fieldName), Bytes.toBytes(fieldValue.asInstanceOf[java.lang.String])) } else if (dataType.contains("IntegerType")) { put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(fieldName), Bytes.toBytes(fieldValue.asInstanceOf[java.lang.Integer])) } else if (dataType.contains("DoubleType")) { put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(fieldName), Bytes.toBytes(fieldValue.asInstanceOf[java.lang.Double])) } else if (dataType.contains("LongType")) { put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(fieldName), Bytes.toBytes(fieldValue.asInstanceOf[java.lang.Long])) } else if (dataType.contains("DecimalType")) { put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(fieldName), Bytes.toBytes(fieldValue.asInstanceOf[java.math.BigDecimal])) } else if (dataType.contains("FloatType")) { put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(fieldName), Bytes.toBytes(fieldValue.asInstanceOf[java.lang.Float])) } else if (dataType.contains("BooleanType")) { put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(fieldName), Bytes.toBytes(fieldValue.asInstanceOf[java.lang.Boolean])) } else if (dataType.contains("ShortType")) { put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(fieldName), Bytes.toBytes(fieldValue.asInstanceOf[java.lang.Short])) } else if (dataType.contains("NullType") && insertEmpty) { put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(fieldName), null) } } else if (insertEmpty) { put.addColumn(Bytes.toBytes("info"), Bytes.toBytes(fieldName), null) } }) putList += Tuple2(new ImmutableBytesWritable, put) count += putList.size }) putList.iterator }).saveAsNewAPIHadoopDataset(this.getConfiguration(tableName)) }(this.logger, s"execute hadoopPut(tableName: ${tableName}) success. 
keyNum: ${keyNum}") } /** * 根据表名构建hadoop configuration * * @param tableName * HBase表名 * @return * hadoop configuration */ @Internal private[this] def getConfiguration(tableName: String): Configuration = { requireNonEmpty(tableName) if (!this.tableConfMap.containsKey(tableName)) { val hadoopConfiguration = this.config hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tableName) val job = Job.getInstance(hadoopConfiguration) job.setOutputKeyClass(classOf[ImmutableBytesWritable]) job.setOutputValueClass(classOf[Result]) job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]]) this.tableConfMap.put(tableName, job.getConfiguration) } this.tableConfMap.get(tableName) } } /** * 用于单例构建伴生类HBaseContextExt的实例对象 * 每个HBaseContextExt实例使用keyNum作为标识,并且与每个HBase集群一一对应 */ object HBaseBulkConnector extends ConnectorFactory[HBaseBulkConnector] with HBaseBulkFunctions { /** * 创建指定集群标识的HBaseContextExt对象实例 */ override protected def create(conf: Any = null, keyNum: Int = 1): HBaseBulkConnector = { val hadoopConf = if (conf != null) conf.asInstanceOf[Configuration] else HBaseConnector.getConfiguration(keyNum) val connector = new HBaseBulkConnector(SparkSingletonFactory.getSparkSession.sparkContext, hadoopConf, keyNum) connector } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/connector/HBaseBulkFunctions.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.connector import com.zto.fire.hbase.bean.HBaseBaseBean import org.apache.hadoop.hbase.client.Scan import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.streaming.dstream.DStream import scala.reflect.ClassTag /** * HBase Bulk api库 * * @author ChengLong * @since 2.0.0 * @create 2020-12-23 15:46 */ trait HBaseBulkFunctions { /** * 根据RDD[String]批量删除,rdd是rowkey的集合 * 类型为String * * @param rdd * 类型为String的RDD数据集 * @param tableName * HBase表名 */ def bulkDeleteRDD(tableName: String, rdd: RDD[String], keyNum: Int = 1): Unit = { HBaseBulkConnector(keyNum = keyNum).bulkDeleteRDD(tableName, rdd) } /** * 根据Dataset[String]批量删除,Dataset是rowkey的集合 * 类型为String * * @param dataset * 类型为String的Dataset集合 * @param tableName * HBase表名 */ def bulkDeleteDS(tableName: String, dataset: Dataset[String], keyNum: Int = 1): Unit = { HBaseBulkConnector(keyNum = keyNum).bulkDeleteDS(tableName, dataset) } /** * 指定rowkey集合,进行批量删除操作内部会将这个集合转为RDD * 推荐在较大量数据时使用,小数据量的删除操作仍推荐使用HBaseConnector * * @param tableName * HBase表名 * @param seq * 待删除的rowKey集合 */ def bulkDeleteList(tableName: String, seq: Seq[String], keyNum: Int = 1): Unit = { HBaseBulkConnector(keyNum = keyNum).bulkDeleteList(tableName, seq) } /** * 根据rowKey集合批量获取数据,并映射为自定义的JavaBean类型 * * @param tableName * HBase表名 * @param rdd * rowKey集合,类型为RDD[String] * @param clazz * 获取后的记录转换为目标类型(自定义的JavaBean类型) * @tparam E * 自定义JavaBean类型,必须继承自HBaseBaseBean * @return * 自定义JavaBean的对象结果集 */ def bulkGetRDD[E <: HBaseBaseBean[E] : ClassTag](tableName: String, rdd: RDD[String], clazz: Class[E], keyNum: Int = 1): RDD[E] = { HBaseBulkConnector(keyNum = keyNum).bulkGetRDD[E](tableName, rdd, clazz) } /** * 根据rowKey集合批量获取数据,并映射为自定义的JavaBean类型 * * @param tableName * HBase表名 * @param rdd * rowKey集合,类型为RDD[String] * @param clazz * 获取后的记录转换为目标类型(自定义的JavaBean类型) * @tparam E * 自定义JavaBean类型,必须继承自HBaseBaseBean * @return * 自定义JavaBean的对象结果集 */ def bulkGetDF[E <: HBaseBaseBean[E] : ClassTag](tableName: String, rdd: RDD[String], clazz: Class[E], keyNum: Int = 1): DataFrame = { HBaseBulkConnector(keyNum = keyNum).bulkGetDF[E](tableName, rdd, clazz) } /** * 根据rowKey集合批量获取数据,并映射为自定义的JavaBean类型 * * @param tableName * HBase表名 * @param rdd * rowKey集合,类型为RDD[String] * @param clazz * 获取后的记录转换为目标类型(自定义的JavaBean类型) * @tparam E * 自定义JavaBean类型,必须继承自HBaseBaseBean * @return * 自定义JavaBean的对象结果集 */ def bulkGetDS[E <: HBaseBaseBean[E] : ClassTag](tableName: String, rdd: RDD[String], clazz: Class[E], keyNum: Int = 1): Dataset[E] = { HBaseBulkConnector(keyNum = keyNum).bulkGetDS[E](tableName, rdd, clazz) } /** * 根据rowKey集合批量获取数据,并映射为自定义的JavaBean类型 * 内部实现是将rowkey集合转为RDD[String],推荐在数据量较大 * 时使用。数据量较小请优先使用HBaseConnector * * @param tableName * HBase表名 * @param clazz * 具体类型 * @param seq * rowKey集合 * @tparam E * 自定义JavaBean类型,必须继承自HBaseBaseBean * @return * 自定义JavaBean的对象结果集 */ def bulkGetSeq[E <: HBaseBaseBean[E] : ClassTag](tableName: String, seq: Seq[String], clazz: Class[E], keyNum: Int = 1): RDD[E] = { HBaseBulkConnector(keyNum = keyNum).bulkGetSeq[E](tableName, seq, clazz) } /** * 批量写入,将自定义的JavaBean数据集批量并行写入 * 到HBase的指定表中。内部会将自定义JavaBean的相应 * 字段一一映射为Put对象,并完成一次写入 * * @param tableName * HBase表名 * @param rdd * 数据集合,数类型需继承自HBaseBaseBean * @tparam T * 数据类型为HBaseBaseBean的子类 */ def bulkPutRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, rdd: RDD[T], keyNum: Int = 1): Unit = { HBaseBulkConnector(keyNum = keyNum).bulkPutRDD[T](tableName, rdd) } /** * 批量写入,将自定义的JavaBean数据集批量并行写入 * 到HBase的指定表中。内部会将自定义JavaBean的相应 * 字段一一映射为Put对象,并完成一次写入。如果数据量 * 
较大,推荐使用。数据量过小则推荐使用HBaseConnector * * @param tableName * HBase表名 * @param seq * 数据集,类型为HBaseBaseBean的子类 * @tparam T * 对象类型必须是HBaseBaseBean的子类 */ def bulkPutSeq[T <: HBaseBaseBean[T] : ClassTag](tableName: String, seq: Seq[T], keyNum: Int = 1): Unit = { HBaseBulkConnector(keyNum = keyNum).bulkPutSeq[T](tableName, seq) } /** * 定制化scan设置后从指定的表中scan数据 * 并将scan到的结果集映射为自定义JavaBean对象 * * @param tableName * HBase表名 * @param scan * scan对象 * @param clazz * 自定义JavaBean的Class对象 * @tparam T * 对象类型必须是HBaseBaseBean的子类 * @return * scan获取到的结果集,类型为RDD[T] */ def bulkScanRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan, keyNum: Int = 1): RDD[T] = { HBaseBulkConnector(keyNum = keyNum).bulkScanRDD[T](tableName, clazz, scan) } /** * 指定startRow和stopRow后自动创建scan对象完成数据扫描 * 并将scan到的结果集映射为自定义JavaBean对象 * * @param tableName * HBase表名 * @param startRow * rowkey的起始 * @param stopRow * rowkey的结束 * @param clazz * 自定义JavaBean的Class对象 * @tparam T * 对象类型必须是HBaseBaseBean的子类 * @return * scan获取到的结果集,类型为RDD[T] */ def bulkScanRDD2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String, keyNum: Int = 1): RDD[T] = { HBaseBulkConnector(keyNum = keyNum).bulkScanRDD2[T](tableName, clazz, startRow, stopRow) } /** * 批量写入,将自定义的JavaBean数据集批量并行写入 * 到HBase的指定表中。内部会将自定义JavaBean的相应 * 字段一一映射为Put对象,并完成一次写入 * * @param tableName * HBase表名 * @param dataFrame * dataFrame实例,数类型需继承自HBaseBaseBean * @tparam T * 数据类型为HBaseBaseBean的子类 */ def bulkPutDF[T <: HBaseBaseBean[T] : ClassTag](tableName: String, dataFrame: DataFrame, clazz: Class[T], keyNum: Int = 1): Unit = { HBaseBulkConnector(keyNum = keyNum).bulkPutDF[T](tableName, dataFrame, clazz) } /** * 批量写入,将自定义的JavaBean数据集批量并行写入 * 到HBase的指定表中。内部会将自定义JavaBean的相应 * 字段一一映射为Put对象,并完成一次写入 * * @param tableName * HBase表名 * @param dataset * dataFrame实例,数类型需继承自HBaseBaseBean * @tparam T * 数据类型为HBaseBaseBean的子类 */ def bulkPutDS[T <: HBaseBaseBean[T] : ClassTag](tableName: String, dataset: Dataset[T], keyNum: Int = 1): Unit = { HBaseBulkConnector(keyNum = keyNum).bulkPutDS[T](tableName, dataset) } /** * 用于已经映射为指定类型的DStream实时 * 批量写入至HBase表中 * * @param tableName * HBase表名 * @param dstream * 类型为自定义JavaBean的DStream流 * @tparam T * 对象类型必须是HBaseBaseBean的子类 */ def bulkPutStream[T <: HBaseBaseBean[T] : ClassTag](tableName: String, dstream: DStream[T], keyNum: Int = 1): Unit = { HBaseBulkConnector(keyNum = keyNum).bulkPutStream[T](tableName, dstream) } /** * 以spark 方式批量将rdd数据写入到hbase中 * * @param rdd * 类型为HBaseBaseBean子类的rdd * @param tableName * hbase表名 * @tparam T * 数据类型 */ def hadoopPut[T <: HBaseBaseBean[T] : ClassTag](tableName: String, rdd: RDD[T], keyNum: Int = 1): Unit = { HBaseBulkConnector(keyNum = keyNum).hadoopPut[T](tableName, rdd) } /** * 使用spark API的方式将DataFrame中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 * @param clazz * JavaBean类型,为HBaseBaseBean的子类 */ def hadoopPutDF[E <: HBaseBaseBean[E] : ClassTag](tableName: String, dataFrame: DataFrame, clazz: Class[E], keyNum: Int = 1): Unit = { HBaseBulkConnector(keyNum = keyNum).hadoopPutDF[E](tableName, dataFrame, clazz) } /** * 使用spark API的方式将DataFrame中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 * @param dataset * JavaBean类型,待插入到hbase的数据集 */ def hadoopPutDS[E <: HBaseBaseBean[E] : ClassTag](tableName: String, dataset: Dataset[E], keyNum: Int = 1): Unit = { HBaseBulkConnector(keyNum = keyNum).hadoopPutDS[E](tableName, dataset) } /** * 以spark 方式批量将DataFrame数据写入到hbase中 * 注:此方法与hbaseHadoopPutDF不同之处在于,它不强制要求该DataFrame一定要与HBaseBaseBean的子类对应 * 
但需要指定rowKey的构建规则,相对与hbaseHadoopPutDF来说,少了中间的两次转换,性能会更高 * * @param df * spark的DataFrame * @param tableName * hbase表名 * @tparam T * JavaBean类型 */ def hadoopPutDFRow[T <: HBaseBaseBean[T] : ClassTag](tableName: String, df: DataFrame, buildRowKey: (Row) => String, keyNum: Int = 1): Unit = { HBaseBulkConnector(keyNum = keyNum).hadoopPutDFRow[T](tableName, df, buildRowKey) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/connector/HBaseSparkBridge.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.connector import java.nio.charset.StandardCharsets import com.zto.fire.core.connector.{ConnectorFactory, FireConnector} import com.zto.fire.hbase.HBaseConnector import com.zto.fire.hbase.bean.HBaseBaseBean import com.zto.fire.hbase.conf.FireHBaseConf import com.zto.fire.hbase.utils.HBaseUtils import com.zto.fire.predef._ import com.zto.fire.spark.util.{SparkSingletonFactory, SparkUtils} import org.apache.commons.lang3.StringUtils import org.apache.hadoop.hbase.client.{Get, Result, Scan} import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.mapreduce.TableInputFormat import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.storage.StorageLevel import scala.collection.mutable.ListBuffer import scala.reflect.ClassTag /** * HBase-Spark桥,为Spark提供了使用Java API操作HBase的方式 * * @author ChengLong 2019-5-10 14:39:39 */ class HBaseSparkBridge(keyNum: Int = 1) extends FireConnector(keyNum = keyNum) { private[this] lazy val spark = SparkSingletonFactory.getSparkSession def batchSize: Int = FireHBaseConf.hbaseBatchSize() /** * 使用Java API的方式将DataFrame中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 * @param df * DataFrame * @param clazz * JavaBean类型,为HBaseBaseBean的子类 */ def hbasePutDF[E <: HBaseBaseBean[E] : ClassTag](tableName: String, clazz: Class[E], df: DataFrame): Unit = { df.mapPartitions(row => SparkUtils.sparkRowToBean(row, clazz))(Encoders.bean(clazz)).foreachPartition((it: Iterator[E]) => { this.multiBatchInsert(tableName, it) }) } /** * 使用Java API的方式将Dataset中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 * @param ds * DataSet[E]的具体类型必须为HBaseBaseBean的子类 * @param clazz * JavaBean类型,为HBaseBaseBean的子类 */ def hbasePutDS[E <: HBaseBaseBean[E] : ClassTag](tableName: String, clazz: Class[E], ds: Dataset[E]): Unit = { ds.foreachPartition((it: Iterator[E]) => { this.multiBatchInsert(tableName, it) }) } /** * 使用Java API的方式将RDD中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 */ def hbasePutRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, rdd: RDD[T]): Unit = { rdd.foreachPartition(it => { this.multiBatchInsert(tableName, it) }) } /** * Scan指定HBase表的数据,并映射为DataFrame * * @param tableName 
* HBase表名 * @param scan * scan对象 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseScanDF[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan): DataFrame = { val beanRDD = this.hbaseScanRDD(tableName, clazz, scan) // 将rdd转为DataFrame this.spark.createDataFrame(beanRDD, clazz) } /** * Scan指定HBase表的数据,并映射为Dataset * * @param tableName * HBase表名 * @param scan * scan对象 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseScanDS[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan): Dataset[T] = { val beanRDD = this.hbaseScanRDD(tableName, clazz, scan) spark.createDataset(beanRDD)(Encoders.bean(clazz)) } /** * Scan指定HBase表的数据,并映射为Dataset * * @param tableName * HBase表名 * @param startRow * 开始主键 * @param stopRow 结束主键 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseScanDS2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String): Dataset[T] = { this.hbaseScanDS[T](tableName, clazz, HBaseConnector.buildScan(startRow, stopRow)) } /** * Scan指定HBase表的数据,并映射为RDD[(ImmutableBytesWritable, Result)] * * @param tableName * HBase表名 * @param scan * scan对象 * 目标类型 * @return */ def hbaseHadoopScanRS(tableName: String, scan: Scan): RDD[(ImmutableBytesWritable, Result)] = { val hbaseConf = HBaseConnector(keyNum = this.keyNum).getConfiguration hbaseConf.set(TableInputFormat.INPUT_TABLE, tableName) hbaseConf.set(TableInputFormat.SCAN, HBaseUtils.convertScanToString(scan)) // 将指定范围内的hbase数据转为rdd val resultRDD = this.spark.sparkContext.newAPIHadoopRDD(hbaseConf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result]).repartition(FireHBaseConf.hbaseHadoopScanPartitions(this.keyNum)).persist(StorageLevel.fromString(FireHBaseConf.hbaseStorageLevel(this.keyNum))) resultRDD } /** * Scan指定HBase表的数据,并映射为RDD[(ImmutableBytesWritable, Result)] * * @param tableName * HBase表名 * @param startRow * rowKey开始位置 * @param stopRow * rowKey结束位置 * 目标类型 * @return */ def hbaseHadoopScanRS2(tableName: String, startRow: String, stopRow: String): RDD[(ImmutableBytesWritable, Result)] = { this.hbaseHadoopScanRS(tableName, HBaseConnector.buildScan(startRow, stopRow)) } /** * Scan指定HBase表的数据,并映射为RDD[(T] * * @param tableName * HBase表名 * @param scan * scan对象 * 目标类型 * @return */ def hbaseHadoopScanRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan): RDD[T] = { val rdd = this.hbaseHadoopScanRS(tableName, scan) rdd.mapPartitions(it => HBaseConnector(keyNum = keyNum).hbaseRow2BeanList(it, clazz)) } /** * Scan指定HBase表的数据,并映射为RDD[T] * * @param tableName * HBase表名 * @param startRow * rowKey开始位置 * @param stopRow * rowKey结束位置 * 目标类型 * @return */ def hbaseHadoopScanRDD2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String): RDD[T] = { this.hbaseHadoopScanRDD[T](tableName, clazz, HBaseConnector.buildScan(startRow, stopRow)) } /** * Scan指定HBase表的数据,并映射为RDD[(T] * * @param tableName * HBase表名 * @param scan * scan对象 * 目标类型 * @return */ def hbaseHadoopScanDF[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan): DataFrame = { val rdd = this.hbaseHadoopScanRDD[T](tableName, clazz, scan) this.spark.createDataFrame(rdd, clazz) } /** * Scan指定HBase表的数据,并映射为RDD[(ImmutableBytesWritable, Result)] * * @param tableName * HBase表名 * @param startRow * rowKey开始位置 * @param stopRow * rowKey结束位置 * 目标类型 * @return */ def hbaseHadoopScanDF2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], 
startRow: String, stopRow: String): DataFrame = { this.hbaseHadoopScanDF[T](tableName, clazz, HBaseConnector.buildScan(startRow, stopRow)) } /** * Scan指定HBase表的数据,并映射为RDD[(T] * * @param tableName * HBase表名 * @param scan * scan对象 * 目标类型 * @return */ def hbaseHadoopScanDS[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan): Dataset[T] = { val rdd = this.hbaseHadoopScanRDD[T](tableName, clazz, scan) this.spark.createDataset(rdd)(Encoders.bean(clazz)) } /** * Scan指定HBase表的数据,并映射为RDD[(ImmutableBytesWritable, Result)] * * @param tableName * HBase表名 * @param startRow * rowKey开始位置 * @param stopRow * rowKey结束位置 * 目标类型 * @return */ def hbaseHadoopScanDS2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String): Dataset[T] = { this.hbaseHadoopScanDS[T](tableName, clazz, HBaseConnector.buildScan(startRow, stopRow)) } /** * Scan指定HBase表的数据,并映射为RDD[(ImmutableBytesWritable, Result)] * * @param tableName * HBase表名 * @param startRow * rowKey开始位置 * @param stopRow * rowKey结束位置 * 目标类型 * @return */ def hbaseScanDF2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String): DataFrame = { this.hbaseScanDF(tableName, clazz, HBaseConnector.buildScan(startRow, stopRow)) } /** * Scan指定HBase表的数据,并映射为RDD[(ImmutableBytesWritable, Result)] * * @param tableName * HBase表名 * @param scan * HBase scan对象 * @return */ def hbaseScanRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan): RDD[T] = { HBaseConnector(keyNum = this.keyNum).setScanMaxVersions[T](scan) val hbaseRDD = this.hbaseHadoopScanRS(tableName, scan) val scanRDD = hbaseRDD.mapPartitions(it => { if (HBaseConnector(keyNum = this.keyNum).getMultiVersion[T]) { HBaseConnector(keyNum = keyNum).hbaseMultiVersionRow2BeanList[T](it, clazz) } else { HBaseConnector(keyNum = keyNum).hbaseRow2BeanList(it, clazz) } }).persist(StorageLevel.fromString(FireHBaseConf.hbaseStorageLevel(this.keyNum))) scanRDD } /** * Scan指定HBase表的数据,并映射为List * * @param tableName * HBase表名 * @param scan * hbase scan对象 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseScanList[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan): Seq[T] = { HBaseConnector(keyNum = this.keyNum).scan(tableName, clazz, scan) } /** * Scan指定HBase表的数据,并映射为List * * @param tableName * HBase表名 * @param startRow * 开始主键 * @param stopRow 结束主键 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseScanList2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String): Seq[T] = { this.hbaseScanList[T](tableName, clazz, HBaseConnector.buildScan(startRow, stopRow)) } /** * 通过RDD[String]批量获取对应的数据(可获取历史版本的记录) * * @param rowKeyRDD * rdd中存放了待查询的rowKey集合 * @param tableName * HBase表名 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseGetRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], rowKeyRDD: RDD[String]): RDD[T] = { val getRDD = rowKeyRDD.mapPartitions(it => { val beanList = ListBuffer[T]() val getList = ListBuffer[Get]() it.foreach(rowKey => { if (StringUtils.isNotBlank(rowKey)) { val get = new Get(rowKey.getBytes(StandardCharsets.UTF_8)) getList += get if (getList.size >= this.batchSize) { beanList ++= HBaseConnector(keyNum = this.keyNum).get(tableName, clazz, getList: _*) getList.clear() } } }) if (getList.nonEmpty) { beanList ++= HBaseConnector(keyNum = this.keyNum).get(tableName, clazz, getList: _*) getList.clear() } beanList.iterator 
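// Gets are buffered per partition and a multi-get is issued every `batchSize` rowkeys;
// the leftover Gets were flushed just above, so this iterator carries every bean fetched
// for the partition. The resulting RDD is then persisted with the configured storage level.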
}).persist(StorageLevel.fromString(FireHBaseConf.hbaseStorageLevel(this.keyNum))) getRDD } /** * 通过RDD[String]批量获取对应的数据(可获取历史版本的记录) * * @param rowKeyRDD * rdd中存放了待查询的rowKey集合 * @param tableName * HBase表名 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseGetDF[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], rowKeyRDD: RDD[String]): DataFrame = { this.spark.createDataFrame(hbaseGetRDD(tableName, clazz, rowKeyRDD), clazz) } /** * 通过RDD[String]批量获取对应的数据(可获取历史版本的记录) * * @param rowKeyRDD * rdd中存放了待查询的rowKey集合 * @param tableName * HBase表名 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseGetDS[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], rowKeyRDD: RDD[String]): Dataset[T] = { this.spark.createDataset(hbaseGetRDD(tableName, clazz, rowKeyRDD))(Encoders.bean(clazz)) } /** * 使用hbase java api方式插入一个集合的数据到hbase表中 * * @param tableName * hbase表名 * @param seq * HBaseBaseBean的子类集合 */ def hbasePutList[T <: HBaseBaseBean[T] : ClassTag](tableName: String, seq: Seq[T]): Unit = { HBaseConnector(keyNum = this.keyNum).insert[T](tableName, seq: _*) } /** * 根据rowKey查询数据,并转为List[T] * * @param tableName * hbase表名 * @param seq * rowKey集合 * @param clazz * 目标类型 * get的版本数 * @return * List[T] */ def hbaseGetList[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], seq: Seq[Get]): Seq[T] = { HBaseConnector(keyNum = this.keyNum).get[T](tableName, clazz, seq: _*) } /** * 根据rowKey查询数据,并转为List[T] * * @param tableName * hbase表名 * @param seq * rowKey集合 * @param clazz * 目标类型 * @return * List[T] */ def hbaseGetList2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], seq: Seq[String]): Seq[T] = { val getList = ListBuffer[Get]() seq.filter(StringUtils.isNotBlank).foreach(rowKey => { getList += new Get(rowKey.getBytes(StandardCharsets.UTF_8)) }) this.hbaseGetList[T](tableName, clazz, getList) } /** * 根据rowKey集合批量删除记录 * * @param tableName * hbase表名 * @param rowKeys * rowKey集合 */ def hbaseDeleteList(tableName: String, rowKeys: Seq[String]): Unit = { HBaseConnector(keyNum = this.keyNum).deleteRows(tableName, rowKeys: _*) } /** * 根据RDD[RowKey]批量删除记录 * * @param tableName * hbase表名 * @param rowKeyRDD * rowKey集合 */ def hbaseDeleteRDD(tableName: String, rowKeyRDD: RDD[String]): Unit = { rowKeyRDD.foreachPartition(it => { val rowKeyList = ListBuffer[String]() var count = 0 it.foreach(rowKey => { if (StringUtils.isNotBlank(rowKey)) { rowKeyList += rowKey count += rowKeyList.size } if (rowKeyList.size >= batchSize) { HBaseConnector(keyNum = this.keyNum).deleteRows(tableName, rowKeyList: _*) rowKeyList.clear() } }) if (rowKeyList.nonEmpty) { HBaseConnector(keyNum = this.keyNum).deleteRows(tableName, rowKeyList: _*) rowKeyList.clear() } }) } /** * 根据Dataset[RowKey]批量删除记录 * * @param tableName * hbase表名 * @param dataSet * rowKey集合 */ def hbaseDeleteDS(tableName: String, dataSet: Dataset[String]): Unit = { this.hbaseDeleteRDD(tableName, dataSet.rdd) } /** * 按照指定的批次大小分多个批次插入数据到hbase中 * * @param tableName * hbase表名 * @param iterator * 数据集迭代器 */ private def multiBatchInsert[E <: HBaseBaseBean[E] : ClassTag](tableName: String, iterator: Iterator[E]): Unit = { var count = 0 val list = ListBuffer[E]() iterator.foreach(bean => { list += bean if (list.size >= batchSize) { HBaseConnector(keyNum = this.keyNum).insert[E](tableName, list: _*) count += list.size list.clear() } }) if (list.nonEmpty) HBaseConnector(keyNum = this.keyNum).insert[E](tableName, list: _*) count += list.size list.clear() } } /** * 用于单例构建伴生类HBaseSparkBridge的实例对象 * 
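* A hedged usage sketch (the table name, bean class and keyNum below are illustrative
* assumptions, not values taken from this repository):
* {{{
*   // obtain the bridge bound to the HBase cluster configured under keyNum = 2, then scan
*   // a rowkey range into a Dataset of the given HBaseBaseBean subclass
*   val orders = HBaseSparkBridge(keyNum = 2).hbaseScanDS2("t_order", classOf[Order], "2022", "2023")
* }}}
*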
每个HBaseSparkBridge实例使用keyNum作为标识,并且与每个HBase集群一一对应 */ object HBaseSparkBridge extends ConnectorFactory[HBaseSparkBridge] { /** * 约定创建connector子类实例的方法 */ override protected def create(conf: Any = null, keyNum: Int = 1): HBaseSparkBridge = { requireNonEmpty(keyNum) val connector = new HBaseSparkBridge(keyNum) logger.debug(s"创建HBaseSparkBridge实例成功. keyNum=$keyNum") connector } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/core/DStreamExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.ext.core import com.zto.fire._ import com.zto.fire.common.util.{ExceptionBus, Logging} import com.zto.fire.hbase.bean.HBaseBaseBean import com.zto.fire.spark.connector.HBaseBulkConnector import com.zto.fire.spark.util.SparkSingletonFactory import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.rocketmq.common.message.MessageExt import org.apache.rocketmq.spark.{CanCommitOffsets => RocketCanCommitOffsets} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.{DStream, InputDStream} import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges} import scala.reflect._ import scala.util.Try /** * DStream扩展 * * @param stream * stream对象 * @author ChengLong 2019-5-18 11:06:56 */ class DStreamExt[T: ClassTag](stream: DStream[T]) extends Logging { private[this] lazy val spark = SparkSingletonFactory.getSparkSession /** * DStrea数据实时写入 * * @param tableName * HBase表名 */ def hbaseBulkPutStream[T <: HBaseBaseBean[T] : ClassTag](tableName: String, keyNum: Int = 1): Unit = { HBaseBulkConnector.bulkPutStream(tableName, stream.asInstanceOf[DStream[T]], keyNum) } /** * 清空RDD的缓存 */ def uncache: Unit = { stream.persist(StorageLevel.NONE) } /** * 维护kafka的offset,生产下可能会导致丢数据的风险 * use rdd.kafkaCommitOffsets(dStream) */ @deprecated("rdd.kafkaCommitOffsets", since = "2.2.0") def kafkaCommitOffsets[T <: ConsumerRecord[String, String]]: Unit = { stream.asInstanceOf[DStream[T]].foreachRDD { rdd => try { val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges) } catch { case e: Exception => e.printStackTrace() } } } /** * 维护RocketMQ的offset,生产下可能会导致丢数据的风险 * use rdd.rocketCommitOffsets(dStream) */ @deprecated("rdd.rocketCommitOffsets", since = "2.2.0") def rocketCommitOffsets[T <: MessageExt]: Unit = { stream.asInstanceOf[DStream[T]].foreachRDD { rdd => if (!rdd.isEmpty()) { try { val offsetRanges = rdd.asInstanceOf[org.apache.rocketmq.spark.HasOffsetRanges].offsetRanges stream.asInstanceOf[org.apache.rocketmq.spark.CanCommitOffsets].commitAsync(offsetRanges) } catch { case e: Exception => 
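// a failure here is only printed; the offsets for this batch are left uncommitted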
e.printStackTrace() } } } } /** * 至少一次的语义保证,当rdd处理成功时提交offset,当处理失败时重试指定的次数 * 该算子支持识别kafka和rocketmq的源,并在执行成功的情况下提交offset * 注:必须在最原始的DStream上调用该算子,不能经过任何的transform转换,否则会报错 * * @param process * rdd的处理逻辑 * @param reTry * rdd处理失败重试的次数 * @param exitOnFailure * 当重试多次仍失败时是否退出 */ def foreachRDDAtLeastOnce(process: RDD[T] => Unit)(implicit reTry: Int = 3, duration: Long = 3000, autoCommit: Boolean = true, exitOnFailure: Boolean = true): Unit = { this.stream.foreachRDD((rdd, batchTime) => { // 用户的业务逻辑处理,对于处理失败的RDD重试指定的次数 val retValue = Try { try { retry(reTry, duration) { process(rdd) } } } // 根据rdd处理的成功与否决定是否提交offset或退出任务 if (retValue.isSuccess) { if (autoCommit) { this.stream match { // 提交kafka的offset case dstream: CanCommitOffsets => { rdd.kafkaCommitOffsets(dstream.asInstanceOf[DStream[ConsumerRecord[String, String]]]) this.logger.info(s"批次[${batchTime}]执行成功,kafka offset提交成功") } // 提交rocketmq的offset case dstream: RocketCanCommitOffsets => { rdd.rocketCommitOffsets(dstream.asInstanceOf[InputDStream[MessageExt]]) this.logger.info(s"批次[${batchTime}]执行成功,rocketmq offset提交成功") } case _ => throw new IllegalArgumentException("DStream必须为最原始的source流,不能经过transformation算子做转换!") } } } else if (exitOnFailure) { this.logger.error(s"批次[${batchTime}]执行失败,offset未提交,任务将退出") this.logger.error(s"异常堆栈:${ExceptionBus.stackTrace(retValue.failed.get)}") SparkSingletonFactory.getStreamingContext.stop(true, false) } }) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/core/DataFrameExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.ext.core import com.zto.fire.common.util.{Logging, ValueUtils} import com.zto.fire.hbase.bean.HBaseBaseBean import com.zto.fire.jdbc.JdbcConnector import com.zto.fire.jdbc.conf.FireJdbcConf import com.zto.fire.jdbc.util.DBUtils import com.zto.fire.spark.conf.FireSparkConf import com.zto.fire.spark.connector.{HBaseBulkConnector, HBaseSparkBridge} import com.zto.fire.spark.util.SparkUtils import org.apache.commons.lang3.StringUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.storage.StorageLevel import java.util.Properties import scala.collection.mutable.ListBuffer import scala.reflect._ /** * DataFrame扩展 * * @param dataFrame * dataFrame实例 */ class DataFrameExt(dataFrame: DataFrame) extends Logging { /** * 注册为临时表的同时缓存表 * * @param tmpTableName * 临时表名 * @param storageLevel * 指定存储级别 * @return * 生成的DataFrame */ def createOrReplaceTempViewCache(tmpTableName: String, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER): DataFrame = { if (StringUtils.isNotBlank(tmpTableName)) { dataFrame.createOrReplaceTempView(tmpTableName) this.dataFrame.sparkSession.catalog.cacheTable(tmpTableName, storageLevel) } dataFrame } /** * 保存Hive表 * * @param saveMode * 保存模式,默认为Overwrite * @param partitionName * 分区字段 * @param tableName * 表名 * @return * 生成的DataFrame */ def saveAsHiveTable(tableName: String, partitionName: String, saveMode: SaveMode = SaveMode.valueOf(FireSparkConf.saveMode)): DataFrame = { if (StringUtils.isNotBlank(tableName)) { if (StringUtils.isNotBlank(partitionName)) { dataFrame.write.mode(saveMode).partitionBy(partitionName).saveAsTable(tableName) } else { dataFrame.write.mode(saveMode).saveAsTable(tableName) } } dataFrame } /** * 将DataFrame数据保存到关系型数据库中 * * @param tableName * 关系型数据库表名 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return */ def jdbcTableSave(tableName: String, saveMode: SaveMode = SaveMode.Append, jdbcProps: Properties = null, keyNum: Int = 1): Unit = { dataFrame.write.mode(saveMode).jdbc(FireJdbcConf.jdbcUrl(keyNum), tableName, DBUtils.getJdbcProps(jdbcProps, keyNum)) } /** * 将DataFrame中指定的列写入到jdbc中 * 调用者需自己保证DataFrame中的列类型与关系型数据库对应字段类型一致 * * @param sql * 关系型数据库待执行的增删改sql * @param fields * 指定部分DataFrame列名作为参数,顺序要对应sql中问号占位符的顺序 * 若不指定字段,则默认传入当前DataFrame所有列,且列的顺序与sql中问号占位符顺序一致 * @param batch * 每个批次执行多少条 * @param keyNum * 对应配置文件中指定的数据源编号 */ def jdbcBatchUpdate(sql: String, fields: Seq[String] = null, batch: Int = FireJdbcConf.batchSize(), keyNum: Int = 1): Unit = { if (ValueUtils.isEmpty(sql)) { logger.error("执行jdbcBatchUpdate失败,sql语句不能为空") return } if (dataFrame.isStreaming) { // 如果是streaming流 dataFrame.writeStream.format("fire-jdbc") .option("checkpointLocation", FireSparkConf.chkPointDirPrefix) .option("sql", sql) .option("batch", batch) .option("keyNum", keyNum) .option("fields", if (fields != null) fields.mkString(",") else "") .start() } else { // 非structured streaming调用 dataFrame.foreachPartition((it: Iterator[Row]) => { var count: Int = 0 val list = ListBuffer[ListBuffer[Any]]() var params: ListBuffer[Any] = null it.foreach(row => { count += 1 params = ListBuffer[Any]() if (ValueUtils.noEmpty(fields)) { // 若调用者指定了某些列,则取这些列的数据 fields.foreach(field => { val index = row.fieldIndex(field) params += row.get(index) }) } else { // 否则取当前DataFrame全部的列,顺序要与sql问号占位符保持一致 (0 until row.size).foreach(index => { params += row.get(index) }) } list += params // 分批次执行 if (count == batch) { JdbcConnector.executeBatch(sql, list, 
keyNum = keyNum) count = 0 list.clear() } }) // 将剩余的数据一次执行掉 if (list.nonEmpty) { JdbcConnector.executeBatch(sql, list, keyNum = keyNum) list.clear() } }) } } /** * 批量写入,将自定义的JavaBean数据集批量并行写入 * 到HBase的指定表中。内部会将自定义JavaBean的相应 * 字段一一映射为Put对象,并完成一次写入 * * @param tableName * HBase表名 * @tparam T * 数据类型为HBaseBaseBean的子类 */ def hbaseBulkPutDF[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], keyNum: Int = 1): Unit = { HBaseBulkConnector.bulkPutDF[T](tableName, dataFrame, clazz, keyNum) } /** * 以spark 方式批量将DataFrame数据写入到hbase中 * 注:此方法与hbaseHadoopPutDF不同之处在于,它不强制要求该DataFrame一定要与HBaseBaseBean的子类对应 * 但需要指定rowKey的构建规则,相对与hbaseHadoopPutDF来说,少了中间的两次转换,性能会更高 * * @param tableName * hbase表名 * @tparam T * JavaBean类型 */ def hbaseHadoopPutDFRow[T <: HBaseBaseBean[T] : ClassTag](tableName: String, buildRowKey: (Row) => String, keyNum: Int = 1): Unit = { HBaseBulkConnector.hadoopPutDFRow[T](tableName, dataFrame, buildRowKey, keyNum) } /** * 使用spark API的方式将DataFrame中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 * @param clazz * JavaBean类型,为HBaseBaseBean的子类 */ def hbaseHadoopPutDF[E <: HBaseBaseBean[E] : ClassTag](tableName: String, clazz: Class[E], keyNum: Int = 1): Unit = { HBaseBulkConnector.hadoopPutDF[E](tableName, dataFrame, clazz, keyNum) } /** * 使用Java API的方式将DataFrame中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 * @param clazz * JavaBean类型,为HBaseBaseBean的子类 */ def hbasePutDF[E <: HBaseBaseBean[E] : ClassTag](tableName: String, clazz: Class[E], keyNum: Int = 1): Unit = { HBaseSparkBridge(keyNum = keyNum).hbasePutDF(tableName, clazz, this.dataFrame) } /** * 将DataFrame注册为临时表,并缓存表 * * @param tableName * 临时表名 */ def dataFrameRegisterAndCache(tableName: String): Unit = { if (StringUtils.isBlank(tableName)) throw new IllegalArgumentException("临时表名不能为空") dataFrame.createOrReplaceTempView(tableName) dataFrame.sqlContext.cacheTable(tableName) } /** * 将DataFrame映射为指定JavaBean类型的RDD * * @param clazz * @return */ def toRDD[E <: Object : ClassTag](clazz: Class[E], toUppercase: Boolean = false): RDD[E] = { this.dataFrame.rdd.mapPartitions(it => SparkUtils.sparkRowToBean(it, clazz, toUppercase)) } /** * 将DataFrame的schema转为小写 * * @return */ def toLowerDF: DataFrame = { this.dataFrame.selectExpr(SparkUtils.schemaToLowerCase(this.dataFrame.schema): _*) } /** * 清空RDD的缓存 */ def uncache: Unit = { dataFrame.unpersist() } /** * 将实时流转为静态DataFrame * * @return * 静态DataFrame */ def toExternalRow: DataFrame = { if (this.dataFrame.isStreaming) SparkUtils.toExternalRow(dataFrame) else this.dataFrame } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/core/DatasetExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.ext.core import com.zto.fire._ import com.zto.fire.common.util.Logging import com.zto.fire.hbase.bean.HBaseBaseBean import com.zto.fire.spark.conf.FireSparkConf import com.zto.fire.spark.connector.{HBaseBulkConnector, HBaseSparkBridge} import com.zto.fire.spark.util.SparkUtils import org.apache.spark.sql._ import org.apache.spark.sql.streaming.Trigger import scala.collection.mutable.ListBuffer import scala.reflect._ /** * Dataset扩展 * * @param dataset * dataset对象 * @author ChengLong 2019-5-18 11:02:56 */ class DatasetExt[T: ClassTag](dataset: Dataset[T]) extends Logging { /** * 用于检查当前Dataset是否为空 * * @return * true: 为空 false:不为空 */ def isEmpty: Boolean = dataset.rdd.isEmpty() /** * 用于检查当前Dataset是否不为空 * * @return * true: 不为空 false:为空 */ def isNotEmpty: Boolean = !this.isEmpty /** * 打印Dataset的值 * * @param lines * 打印的行数 * @return */ def showString(lines: Int = 1000): String = { val showLines = if (lines <= 1000) lines else 1000 val showStringMethod = dataset.getClass.getDeclaredMethod("showString", classOf[Int], classOf[Int], classOf[Boolean]) showStringMethod.invoke(dataset, Integer.valueOf(showLines), Integer.valueOf(Int.MaxValue), java.lang.Boolean.valueOf(false)).toString } /** * 批量写入,将自定义的JavaBean数据集批量并行写入 * 到HBase的指定表中。内部会将自定义JavaBean的相应 * 字段一一映射为Put对象,并完成一次写入 * * @param tableName * HBase表名 * @tparam T * 数据类型为HBaseBaseBean的子类 */ def hbaseBulkPutDS[T <: HBaseBaseBean[T] : ClassTag](tableName: String, keyNum: Int = 1): Unit = { HBaseBulkConnector.bulkPutDS[T](tableName, dataset.asInstanceOf[Dataset[T]], keyNum) } /** * 根据Dataset[String]批量删除,Dataset是rowkey的集合 * 类型为String * * @param tableName * HBase表名 */ def hbaseBulkDeleteDS(tableName: String, keyNum: Int = 1): Unit = { HBaseBulkConnector.bulkDeleteDS(tableName, dataset.asInstanceOf[Dataset[String]], keyNum) } /** * 根据Dataset[RowKey]批量删除记录 * * @param tableName * rowKey集合 */ def hbaseDeleteDS(tableName: String, keyNum: Int = 1): Unit = { HBaseSparkBridge(keyNum = keyNum).hbaseDeleteDS(tableName, dataset.asInstanceOf[Dataset[String]]) } /** * 使用spark API的方式将DataFrame中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 */ def hbaseHadoopPutDS[T <: HBaseBaseBean[T] : ClassTag](tableName: String, keyNum: Int = 1): Unit = { HBaseBulkConnector.hadoopPutDS[T](tableName, dataset.asInstanceOf[Dataset[T]], keyNum) } /** * 使用Java API的方式将Dataset中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 * @param clazz * JavaBean类型,为HBaseBaseBean的子类 */ def hbasePutDS[E <: HBaseBaseBean[E] : ClassTag](tableName: String, clazz: Class[E], keyNum: Int = 1): Unit = { HBaseSparkBridge(keyNum = keyNum).hbasePutDS[E](tableName, clazz, dataset.asInstanceOf[Dataset[E]]) } /** * 清空RDD的缓存 */ def uncache: Unit = { dataset.unpersist } /** * 将当前Dataset记录打印到控制台 */ def print(outputMode: String = "append", trigger: Trigger = null, numRows: Int = 20, truncate: Boolean = true): Dataset[T] = { if (dataset.isStreaming) { val tmpStream = dataset.writeStream.outputMode(outputMode).option("numRows", numRows).option("truncate", truncate).format("console") if (trigger != null) tmpStream.trigger(trigger) tmpStream.start } else { dataset.show(numRows, truncate) } dataset } /** * 分配次执行指定的业务逻辑 * * @param batch * 多大批次执行一次sinkFun中定义的操作 * @param mapFun * 将Row类型映射为E类型的逻辑,并将处理后的数据放到listBuffer中 * @param sinkFun * 具体处理逻辑,将数据sink到目标源 */ def foreachPartitionBatch[E](mapFun: T => E, sinkFun: ListBuffer[E] => Unit, batch: Int = 1000): Unit = { SparkUtils.datasetForeachPartitionBatch(this.dataset, mapFun, sinkFun, batch) } /** * spark datasource write api增强,提供配置文件进行覆盖配置 * * 
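* A hedged example of how this could be invoked (format, path and options are illustrative
* assumptions; values configured in the config file for the same keyNum override them, and
* the fire implicit extensions are assumed to be in scope via import com.zto.fire._):
* {{{
*   ds.writeEnhance(format = "parquet", saveMode = SaveMode.Append,
*     saveParam = "/tmp/fire/output", options = Map("compression" -> "snappy"), keyNum = 1)
* }}}
*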
@param format * DataSource中的format * @param saveMode * DataSource中的saveMode * @param saveParam * save方法的参数,可以是路径或表名:save(path)、saveAsTable(tableName) * @param isSaveTable * true:调用saveAsTable(saveParam)方法 false:调用save(saveParam)方法 * @param options * DataSource中的options,支持参数传入和配置文件读取,相同的选项配置文件优先级更高 * @param keyNum * 用于标识不同DataSource api所对应的配置文件中key的后缀 */ def writeEnhance(format: String = "", saveMode: SaveMode = SaveMode.Append, saveParam: String = "", isSaveTable: Boolean = false, options: Map[String, String] = Map.empty, keyNum: Int = 1): Unit = { val finalFormat = if (noEmpty(FireSparkConf.datasourceFormat(keyNum))) FireSparkConf.datasourceFormat(keyNum) else format val finalSaveMode = if (noEmpty(FireSparkConf.datasourceSaveMode(keyNum))) SaveMode.valueOf(FireSparkConf.datasourceSaveMode(keyNum)) else saveMode val finalSaveParam = if (noEmpty(FireSparkConf.datasourceSaveParam(keyNum))) FireSparkConf.datasourceSaveParam(keyNum) else saveParam val finalIsSaveTable = if (noEmpty(FireSparkConf.datasourceIsSaveTable(keyNum))) FireSparkConf.datasourceIsSaveTable(keyNum).toBoolean else isSaveTable requireNonEmpty(dataset, finalFormat, finalSaveMode, finalSaveParam, finalIsSaveTable) this.logger.info(s"--> Spark DataSource write api参数信息(keyNum=$keyNum)<--") this.logger.info(s"format=${finalFormat} saveMode=${finalSaveMode} save参数=${finalSaveParam} saveToTable=${finalIsSaveTable}") val writer = dataset.write.format(finalFormat).options(SparkUtils.optionsEnhance(options, keyNum)).mode(finalSaveMode) if (!isSaveTable) { if (com.zto.fire.isEmpty(finalSaveMode)) writer.save() else writer.save(finalSaveParam) } else writer.saveAsTable(finalSaveParam) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/core/RDDExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.ext.core import com.zto.fire._ import com.zto.fire.hbase.bean.HBaseBaseBean import com.zto.fire.spark.connector.{HBaseBulkConnector, HBaseSparkBridge} import com.zto.fire.spark.util.{SparkSingletonFactory, SparkUtils} import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.rocketmq.common.message.MessageExt import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.functions.from_json import org.apache.spark.streaming.dstream.{DStream, InputDStream} import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges} import scala.collection.mutable.ListBuffer import scala.reflect.{ClassTag, classTag} /** * RDD相关扩展 * * @author ChengLong 2019-5-18 10:28:31 */ class RDDExt[T: ClassTag](rdd: RDD[T]) { private lazy val spark = SparkSingletonFactory.getSparkSession import spark.implicits._ /** * 用于判断rdd是否为空 * * @return * true: 不为空 false:为空 */ def isNotEmpty: Boolean = !rdd.isEmpty() /** * 遍历每个partition并打印元素到控制台 */ def printEachPartition: Unit = { rdd.foreachPartition(it => { it.foreach(item => println(item + " ")) }) } /** * 集群模式下打印数据 */ def printEachClusterPartition: Unit = { rdd.collect().foreach(println) } /** * 将rdd转为DataFrame */ def toDF(): DataFrame = { this.spark.createDataFrame(rdd, classTag[T].runtimeClass) } /** * 将rdd转为DataFrame并注册成临时表 * * @param tableName * 表名 * @return * DataFrame */ def createOrReplaceTempView(tableName: String, cache: Boolean = false): DataFrame = { val dataFrame = this.toDF() dataFrame.createOrReplaceTempView(tableName) if (cache) this.spark.cacheTables(tableName) dataFrame } /** * 根据RDD[String]批量删除 * * @param tableName * HBase表名 */ def hbaseBulkDeleteRDD[T <: String : ClassTag](tableName: String, keyNum: Int = 1): Unit = { HBaseBulkConnector.bulkDeleteRDD(tableName, rdd.asInstanceOf[RDD[String]], keyNum) } /** * 根据RDD[RowKey]批量删除记录 * * @param tableName * rowKey集合 */ def hbaseDeleteRDD(tableName: String, keyNum: Int = 1): Unit = { HBaseSparkBridge(keyNum = keyNum).hbaseDeleteRDD(tableName, rdd.asInstanceOf[RDD[String]]) } /** * 根据rowKey集合批量获取数据 * * @param tableName * HBase表名 * @param clazz * 获取后的记录转换为目标类型 * @return * 结果集 */ def hbaseBulkGetRDD[E <: HBaseBaseBean[E] : ClassTag](tableName: String, clazz: Class[E], keyNum: Int = 1): RDD[E] = { HBaseBulkConnector.bulkGetRDD(tableName, rdd.asInstanceOf[RDD[String]], clazz, keyNum) } /** * 根据rowKey集合批量获取数据,并映射为自定义的JavaBean类型 * * @param tableName * HBase表名 * @param clazz * 获取后的记录转换为目标类型(自定义的JavaBean类型) * @tparam E * 自定义JavaBean类型,必须继承自HBaseBaseBean * @return * 自定义JavaBean的对象结果集 */ def hbaseBulkGetDF[E <: HBaseBaseBean[E] : ClassTag](tableName: String, clazz: Class[E], keyNum: Int = 1): DataFrame = { HBaseBulkConnector.bulkGetDF[E](tableName, rdd.asInstanceOf[RDD[String]], clazz, keyNum) } /** * 根据rowKey集合批量获取数据,并映射为自定义的JavaBean类型 * * @param tableName * HBase表名 * @param clazz * 获取后的记录转换为目标类型(自定义的JavaBean类型) * @tparam E * 自定义JavaBean类型,必须继承自HBaseBaseBean * @return * 自定义JavaBean的对象结果集 */ def hbaseBulkGetDS[E <: HBaseBaseBean[E] : ClassTag](tableName: String, clazz: Class[E], keyNum: Int = 1): Dataset[E] = { HBaseBulkConnector.bulkGetDS[E](tableName, rdd.asInstanceOf[RDD[String]], clazz, keyNum) } /** * 批量插入数据 * * @param tableName * HBase表名 * 数据集合,继承自HBaseBaseBean */ def hbaseBulkPutRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, keyNum: Int = 1): Unit = { HBaseBulkConnector.bulkPutRDD(tableName, rdd.asInstanceOf[RDD[T]], keyNum) } /** * 使用Spark API的方式将RDD中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 */ def 
hbaseHadoopPutRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, keyNum: Int = 1): Unit = { HBaseBulkConnector.hadoopPut(tableName, rdd.asInstanceOf[RDD[T]], keyNum) } /** * 通过RDD[String]批量获取对应的数据(可获取历史版本的记录) * * @param tableName * HBase表名 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseGetRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], keyNum: Int = 1): RDD[T] = { HBaseSparkBridge(keyNum = keyNum).hbaseGetRDD(tableName, clazz, rdd.asInstanceOf[RDD[String]]) } /** * 通过RDD[String]批量获取对应的数据(可获取历史版本的记录) * * @param tableName * HBase表名 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseGetDS[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], keyNum: Int = 1): Dataset[T] = { HBaseSparkBridge(keyNum = keyNum).hbaseGetDS[T](tableName, clazz, rdd.asInstanceOf[RDD[String]]) } /** * 通过RDD[String]批量获取对应的数据(可获取历史版本的记录) * * @param tableName * HBase表名 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseGetDF[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], keyNum: Int = 1): DataFrame = { HBaseSparkBridge(keyNum = keyNum).hbaseGetDF(tableName, clazz, rdd.asInstanceOf[RDD[String]]) } /** * 使用Java API的方式将RDD中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 */ def hbasePutRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, keyNum: Int = 1): Unit = { HBaseSparkBridge(keyNum = keyNum).hbasePutRDD[T](tableName, rdd.asInstanceOf[RDD[T]]) } /** * 解析DStream中每个rdd的json数据,并转为DataFrame类型 * * @param schema * 目标DataFrame类型的schema * @param isMySQL * 是否为mysql解析的消息 * @param fieldNameUpper * 字段名称是否为大写 * @param parseAll * 是否需要解析所有字段信息 * @return */ def kafkaJson2DFV(schema: Class[_], parseAll: Boolean = false, isMySQL: Boolean = true, fieldNameUpper: Boolean = false): DataFrame = { val ds = this.spark.createDataset(rdd.asInstanceOf[RDD[String]])(Encoders.STRING) val df = ds.select(from_json(new ColumnName("value"), SparkUtils.buildSchema2Kafka(schema, parseAll, isMySQL, fieldNameUpper)).as("data")) if (parseAll) df.select("data.*") else df.select("data.after.*") } /** * 解析DStream中每个rdd的json数据,并转为DataFrame类型 * * @param schema * 目标DataFrame类型的schema * @param isMySQL * 是否为mysql解析的消息 * @param fieldNameUpper * 字段名称是否为大写 * @param parseAll * 是否解析所有字段信息 * @return */ def kafkaJson2DF(schema: Class[_], parseAll: Boolean = false, isMySQL: Boolean = true, fieldNameUpper: Boolean = false): DataFrame = { val ds = this.spark.createDataset(rdd.asInstanceOf[RDD[ConsumerRecord[String, String]]].map(t => t.value()))(Encoders.STRING) val structType = SparkUtils.buildSchema2Kafka(schema, parseAll, isMySQL, fieldNameUpper) val df = ds.select(from_json(new ColumnName("value"), structType).as("data")) val tmpDF = if (parseAll) df.select("data.*") else df.select("data.after.*") if (fieldNameUpper) tmpDF.toLowerDF else tmpDF } /** * 解析json数据,并注册为临时表 * * @param tableName * 临时表名 */ def kafkaJson2Table(tableName: String, cacheTable: Boolean = false): Unit = { val msgDS = rdd.asInstanceOf[RDD[ConsumerRecord[String, String]]].map(t => t.value()).toDS() this.spark.read.json(msgDS).toLowerDF.createOrReplaceTempView(tableName) if (cacheTable) this.spark.cacheTables(tableName) } /** * 清空RDD的缓存 */ def uncache: Unit = { rdd.unpersist() } /** * 维护RocketMQ的offset */ def kafkaCommitOffsets(stream: DStream[ConsumerRecord[String, String]]): Unit = { val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges) } /** * 维护RocketMQ的offset */ def rocketCommitOffsets(stream: 
InputDStream[MessageExt]): Unit = { val offsetRanges = rdd.asInstanceOf[org.apache.rocketmq.spark.HasOffsetRanges].offsetRanges stream.asInstanceOf[org.apache.rocketmq.spark.CanCommitOffsets].commitAsync(offsetRanges) } /** * 分配次执行指定的业务逻辑 * * @param batch * 多大批次执行一次sinkFun中定义的操作 * @param mapFun * 将Row类型映射为E类型的逻辑,并将处理后的数据放到listBuffer中 * @param sinkFun * 具体处理逻辑,将数据sink到目标源 */ def foreachPartitionBatch[E](mapFun: T => E, sinkFun: ListBuffer[E] => Unit, batch: Int = 1000): Unit = { SparkUtils.rddForeachPartitionBatch(this.rdd, mapFun, sinkFun, batch) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/core/SQLContextExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.ext.core import com.zto.fire._ import com.zto.fire.common.conf.FireHiveConf import com.zto.fire.jdbc.conf.FireJdbcConf import com.zto.fire.jdbc.util.DBUtils import com.zto.fire.spark.conf.FireSparkConf import org.apache.commons.lang3.StringUtils import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} import java.util.Properties /** * SQLContext与HiveContext扩展 * * @param sqlContext * sqlContext对象 * @author ChengLong 2019-5-18 10:52:00 */ class SQLContextExt(sqlContext: SQLContext) { /** * 链式设置 * * @return * SQLContext对象 */ def set(key: String, value: String): SQLContext = { sqlContext.setConf(key, value) sqlContext } /** * 执行一段Hive QL语句,注册为临时表,持久化到hive中 * * @param sqlStr * SQL语句 * @param tmpTableName * 临时表名 * @param saveMode * 持久化的模式,默认为Overwrite * @param cache * 默认缓存表 * @return * 生成的DataFrame */ def sqlForPersistent(sqlStr: String, tmpTableName: String, partitionName: String, saveMode: SaveMode = SaveMode.valueOf(FireSparkConf.saveMode), cache: Boolean = true): DataFrame = { val dataFrame = sqlContext.sql(sqlStr) val dataFrameWriter = dataFrame.write.mode(saveMode) if (StringUtils.isNotBlank(partitionName)) { dataFrameWriter.partitionBy(partitionName).saveAsTable(tmpTableName) } else { dataFrameWriter.saveAsTable(tmpTableName) } dataFrame } /** * 执行一段Hive QL语句,注册为临时表,并cache * * @param sqlStr * SQL语句 * @param tmpTableName * 临时表名 * @return * 生成的DataFrame */ def sqlForCache(sqlStr: String, tmpTableName: String): DataFrame = { val dataFrame = sqlContext.sql(sqlStr) dataFrame.createOrReplaceTempView(tmpTableName) sqlContext.cacheTable(tmpTableName) dataFrame } /** * 执行一段Hive QL语句,注册为临时表 * * @param sqlStr * SQL语句 * @param tmpTableName * 临时表名 * @return * 生成的DataFrame */ def sqlNoCache(sqlStr: String, tmpTableName: String): DataFrame = { val dataFrame = sqlContext.sql(sqlStr) dataFrame.createOrReplaceTempView(tmpTableName) dataFrame } /** * 批量清空多张缓存表 * * @param tables * 多个表名 */ def uncacheTables(tables: String*): Unit = { if (noEmpty(tables)) { 
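// only non-blank table names that are currently cached are uncached; anything else is skipped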
tables.filter(StringUtils.isNotBlank).foreach(tableName => { if (sqlContext.isCached(tableName)) { sqlContext.uncacheTable(tableName) } }) } } /** * 批量缓存多张表 * * @param tables * 多个表名 */ def cacheTables(tables: String*): Unit = { tables.foreach(tableName => { sqlContext.cacheTable(tableName) }) } /** * 删除指定的hive表 * * @param tableNames * 多个表名 */ def dropHiveTable(tableNames: String*): Unit = { if (noEmpty(tableNames)) { tableNames.filter(StringUtils.isNotBlank).foreach(tableName => { sqlContext.sql(s"DROP TABLE IF EXISTS $tableName") }) } } /** * 为指定表添加分区 * * @param tableName * 表名 * @param partitions * 分区 */ def addPartitions(tableName: String, partitions: String*): Unit = { if (noEmpty(tableName, partitions)) { partitions.foreach(ds => { this.addPartition(tableName, ds, FireHiveConf.partitionName) }) } } /** * 为指定表添加分区 * * @param tableName * 表名 * @param partition * 分区 * @param partitionName * 分区字段名称,默认ds */ def addPartition(tableName: String, partition: String, partitionName: String = FireHiveConf.partitionName): Unit = { if (noEmpty(tableName, partition, partitionName)) { sqlContext.sql(s"ALTER TABLE $tableName ADD IF NOT EXISTS partition($partitionName='$partition')") } } /** * 为指定表删除分区 * * @param tableName * 表名 * @param partition * 分区 */ def dropPartition(tableName: String, partition: String, partitionName: String = FireHiveConf.partitionName): Unit = { if (noEmpty(tableName, partition, partitionName)) { sqlContext.sql(s"ALTER TABLE $tableName DROP IF EXISTS partition($partitionName='$partition')") } } /** * 为指定表删除多个分区 * * @param tableName * 表名 * @param partitions * 分区 */ def dropPartitions(tableName: String, partitions: String*): Unit = { if (StringUtils.isNotBlank(tableName) && partitions != null) { partitions.foreach(ds => { this.dropPartition(tableName, ds, FireHiveConf.partitionName) }) } } /** * 根据给定的表创建新表 * * @param srcTableName * 源表名 * @param destTableName * 目标表名 */ def createTableAsSelect(srcTableName: String, destTableName: String): Unit = { if (StringUtils.isNotBlank(srcTableName) && StringUtils.isNotBlank(destTableName)) { sqlContext.sql( s""" |CREATE TABLE IF NOT EXISTS $destTableName AS |SELECT * FROM $srcTableName """.stripMargin) } } /** * 根据一张表创建另一张表 * * @param tableName * 表名 * @param destTableName * 目标表名 */ def createTableLike(tableName: String, destTableName: String): Unit = { if (StringUtils.isNotBlank(tableName) && StringUtils.isNotBlank(destTableName)) { sqlContext.sql( s""" |create table $tableName like $destTableName """.stripMargin) } } /** * 根据给定的表创建新表 * * @param srcTableName * 来源表 * @param destTableName * 目标表 * @param cols * 多个列,逗号分隔 */ def createTableAsSelectFields(srcTableName: String, destTableName: String, cols: String): Unit = { if (StringUtils.isNotBlank(srcTableName) && StringUtils.isNotBlank(destTableName) && StringUtils.isNotBlank(cols)) { sqlContext.sql( s""" |CREATE TABLE IF NOT EXISTS $destTableName AS |SELECT $cols FROM $srcTableName """.stripMargin) } } /** * 将数据插入到指定表的分区中 * * @param srcTableName * 来源表 * @param destTableName * 目标表 * @param ds * 分区名 * @param cols * 多个列,逗号分隔 */ def insertIntoPartition(srcTableName: String, destTableName: String, ds: String, cols: String, partitionName: String = FireHiveConf.partitionName): Unit = { sqlContext.sql( s""" |INSERT INTO TABLE $destTableName partition($partitionName='$ds') | SELECT $cols | FROM $srcTableName """.stripMargin) } /** * 将sql执行结果插入到目标表指定分区中 * * @param destTableName * 目标表名 * @param ds * 分区名 * @param querySQL * 查询语句 */ def insertIntoPartitionAsSelect(destTableName: String, ds: String, querySQL: 
String, partitionName: String = FireHiveConf.partitionName, overwrite: Boolean = false): Unit = { val overwriteVal = if (overwrite) "OVERWRITE" else "INTO" sqlContext.sql( s""" |INSERT $overwriteVal TABLE $destTableName partition($partitionName='$ds') | $querySQL """.stripMargin) } /** * 将sql执行结果插入到目标表指定分区中 * * @param destTableName * 目标表名 * @param querySQL * 查询语句 */ def insertIntoDymPartitionAsSelect(destTableName: String, querySQL: String, partitionName: String = FireHiveConf.partitionName): Unit = { sqlContext.sql( s""" |INSERT INTO TABLE $destTableName partition($partitionName) | $querySQL """.stripMargin) } /** * 修改表名 * * @param oldTableName * 表名称 * @param newTableName * 新的表名 */ def rename(oldTableName: String, newTableName: String): Unit = { if (StringUtils.isBlank(oldTableName) || StringUtils.isBlank(newTableName)) { return } val sql = s"ALTER TABLE $oldTableName RENAME TO $newTableName" sqlContext.sql(sql) } /** * 将表从一个db移动到另一个db中 * * @param tableName * 表名 * @param oldDB * 老库名称 * @param newDB * 新库名称 */ def moveDB(tableName: String, oldDB: String, newDB: String): Unit = { if (StringUtils.isBlank(tableName) || StringUtils.isBlank(newDB)) { return } val allName = if (StringUtils.isNotBlank(oldDB) && tableName.indexOf(".") == -1) { s"$oldDB.$tableName" } else { tableName } this.dropHiveTable(s"$newDB.$tableName") val sql = s"ALTER TABLE $allName RENAME TO $newDB.$tableName" println(sql) sqlContext.sql(sql) } // ----------------------------------- 关系型数据库API ----------------------------------- // /** * 单线程加载一张关系型数据库表 * 注:仅限用于小的表,不支持条件查询 * * @param tableName * 关系型数据库表名 * @param jdbcProps * 调用者指定的数据库连接信息,如果为空,则默认读取配置文件 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return * DataFrame */ def jdbcTableLoadAll(tableName: String, jdbcProps: Properties = null, keyNum: Int = 1): DataFrame = { sqlContext.read.jdbc(FireJdbcConf.jdbcUrl(keyNum), tableName, DBUtils.getJdbcProps(jdbcProps, keyNum)) } /** * 指定load的条件,从关系型数据库中并行的load数据,并转为DataFrame * * @param tableName 数据库表名 * @param predicates * 并行load数据时,每一个分区load数据的where条件 * 比如:gmt_create >= '2019-06-20' AND gmt_create <= '2019-06-21' 和 gmt_create >= '2019-06-22' AND gmt_create <= '2019-06-23' * 那么将两个线程同步load,线程数与predicates中指定的参数个数保持一致 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return * 查询结果集 */ def jdbcTableLoad(tableName: String, predicates: Array[String], jdbcProps: Properties = null, keyNum: Int = 1): DataFrame = { sqlContext.read.jdbc(FireJdbcConf.jdbcUrl(keyNum), tableName, predicates, DBUtils.getJdbcProps(jdbcProps, keyNum)) } /** * 根据指定分区字段的范围load关系型数据库中的数据 * * @param tableName * 表名 * @param columnName * 表的分区字段 * @param lowerBound * 分区的下边界 * @param upperBound * 分区的上边界 * @param numPartitions * 加载数据的并行度,默认为10,设置过大可能会导致数据库挂掉 * @param jdbcProps * jdbc连接信息,默认读取配置文件 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return */ def jdbcTableLoadBound(tableName: String, columnName: String, lowerBound: Long, upperBound: Long, numPartitions: Int = 10, jdbcProps: Properties = null, keyNum: Int = 1): DataFrame = { sqlContext.read.jdbc(FireJdbcConf.jdbcUrl(keyNum), tableName, columnName, lowerBound, upperBound, numPartitions, DBUtils.getJdbcProps(jdbcProps, keyNum)) } } ================================================ FILE: 
fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/core/SparkConfExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.ext.core import org.apache.spark.SparkConf /** * SparkConf扩展 * * @param sparkConf * sparkConf对象 * @author ChengLong 2019-5-18 10:50:35 */ class SparkConfExt(sparkConf: SparkConf) { } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/core/SparkContextExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.ext.core import org.apache.spark.SparkContext /** * SparkContext扩展 * * @param sc * SparkContext对象 * @author ChengLong 2019-5-18 10:53:56 */ class SparkContextExt(sc: SparkContext) { } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/core/SparkSessionExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.ext.core import com.zto.fire._ import com.zto.fire.core.Api import com.zto.fire.jdbc.JdbcConnectorBridge import com.zto.fire.spark.bean.GenerateBean import com.zto.fire.spark.conf.FireSparkConf import com.zto.fire.spark.connector.{BeanGenReceiver, DataGenReceiver} import com.zto.fire.spark.ext.provider._ import com.zto.fire.spark.util.{SparkSingletonFactory, SparkUtils} import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.rocketmq.common.message.MessageExt import org.apache.rocketmq.spark.{ConsumerStrategy, LocationStrategy} import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream} import org.apache.spark.streaming.receiver.Receiver import java.io.InputStream import scala.collection.mutable import scala.reflect.ClassTag /** * SparkContext扩展 * * @param spark * sparkSession对象 * @author ChengLong 2019-5-18 10:51:19 */ class SparkSessionExt(spark: SparkSession) extends Api with JdbcConnectorBridge with JdbcSparkProvider with HBaseBulkProvider with SqlProvider with HBaseConnectorProvider with HBaseHadoopProvider with KafkaSparkProvider { private[fire] lazy val ssc = SparkSingletonFactory.getStreamingContext private[this] lazy val appName = ssc.sparkContext.appName /** * 根据给定的集合,创建rdd * * @param seq * seq * @param numSlices * 分区数 * @return * RDD */ def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = sc.defaultParallelism): RDD[T] = { this.sc.parallelize(seq, numSlices) } /** * 根据给定的集合,创建rdd * * @param seq * seq * @param numSlices * 分区数 * @return * RDD */ def createRDD[T: ClassTag](seq: Seq[T], numSlices: Int = sc.defaultParallelism): RDD[T] = { this.parallelize[T](seq, numSlices) } /** * 创建socket流 */ def createSocketStream[T: ClassTag]( hostname: String, port: Int, converter: (InputStream) => Iterator[T], storageLevel: StorageLevel ): ReceiverInputDStream[T] = { this.ssc.socketStream[T](hostname, port, converter, storageLevel) } /** * 创建socket文本流 */ def createSocketTextStream( hostname: String, port: Int, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2 ): ReceiverInputDStream[String] = { this.ssc.socketTextStream(hostname, port, storageLevel) } /** * 构建Kafka DStream流 * * @param kafkaParams * kafka参数 * @param topics * topic列表 * @return * DStream */ def createKafkaDirectStream(kafkaParams: Map[String, Object] = null, topics: Set[String] = null, groupId: String = null, keyNum: Int = 1): DStream[ConsumerRecord[String, String]] = { this.ssc.createDirectStream(kafkaParams, topics, groupId, keyNum) } /** * 构建RocketMQ拉取消息的DStream流 * * @param rocketParam * rocketMQ相关消费参数 * @param groupId * groupId * @param topics * topic列表 * @param consumerStrategy * 从何处开始消费 * @return * rocketMQ DStream */ def createRocketMqPullStream(rocketParam: JMap[String, String] = null, groupId: String = this.appName, topics: String = null, tag: String = null, consumerStrategy: ConsumerStrategy = ConsumerStrategy.lastest, locationStrategy: LocationStrategy = LocationStrategy.PreferConsistent, instance: String = "", keyNum: Int = 1): DStream[MessageExt] = { this.ssc.createRocketPullStream(rocketParam, groupId, topics, tag, consumerStrategy, locationStrategy, instance, keyNum) } /** * 创建根据指定规则生成对象实例的DataGenReceiver * * @param delay * 数据生成间隔时间(ms) * @param generateFun * 数据生成规则 * @tparam T * 生成数据的类型 * @return * ReceiverInputDStream[T] */ def createDataGenStream[T <: GenerateBean[T] : ClassTag](delay: Long = 1000, generateFun: => 
mutable.Buffer[T]): ReceiverInputDStream[T] = { this.receiverStream[T](new DataGenReceiver[T](delay, generateFun = generateFun)) } /** * 创建根据指定规则生成对象实例的BeanDataGenReceiver * * @param delay * 数据生成间隔时间(ms) * @tparam T * 生成数据的类型 * @return * ReceiverInputDStream[T] */ def createBeanGenStream[T <: GenerateBean[T] : ClassTag](delay: Long = 1000): ReceiverInputDStream[T] = { this.receiverStream[T](new BeanGenReceiver[T](delay)) } /** * 接受自定义receiver的数据 * * @param receiver * 自定义receiver * @tparam T * 接受的数据类型 * @return * 包装后的DStream[T] */ def receiverStream[T: ClassTag](receiver: Receiver[T]): ReceiverInputDStream[T] = { this.ssc.receiverStream[T](receiver) } /** * 启动StreamingContext */ override def start(): Unit = { if (this.ssc != null) { this.ssc.startAwaitTermination() } } /** * spark datasource read api增强,提供配置文件进行覆盖配置 * * @param format * DataSource中的format * @param loadParams * load方法的参数,多个路径以逗号分隔 * @param options * DataSource中的options,支持参数传入和配置文件读取,相同的选项配置文件优先级更高 * @param keyNum * 用于标识不同DataSource api所对应的配置文件中key的后缀 */ def readEnhance(format: String = "", loadParams: Seq[String] = null, options: Map[String, String] = Map.empty, keyNum: Int = 1): Unit = { val finalFormat = if (noEmpty(FireSparkConf.datasourceFormat(keyNum))) FireSparkConf.datasourceFormat(keyNum) else format val finalLoadParam = if (noEmpty(FireSparkConf.datasourceLoadParam(keyNum))) FireSparkConf.datasourceLoadParam(keyNum).split(",").toSeq else loadParams this.logger.info(s"--> Spark DataSource read api参数信息(keyNum=$keyNum)<--") this.logger.info(s"format=${finalFormat} loadParams=${finalLoadParam}") requireNonEmpty(finalFormat, finalLoadParam) SparkSingletonFactory.getSparkSession.read.format(format).options(SparkUtils.optionsEnhance(options, keyNum)).load(finalLoadParam: _*) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/core/StreamingContextExt.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.ext.core import com.zto.fire._ import com.zto.fire.common.enu.{Operation => FOperation} import com.zto.fire.common.conf.{FireKafkaConf, FireRocketMQConf} import com.zto.fire.common.util.{LineageManager, Logging} import com.zto.fire.spark.util.{RocketMQUtils, SparkUtils} import org.apache.commons.lang3.StringUtils import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.rocketmq.common.message.MessageExt import org.apache.rocketmq.spark.{ConsumerStrategy, LocationStrategy, RocketMQConfig, RocketMqUtils} import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.kafka010.KafkaUtils /** * StreamingContext扩展 * * @param ssc * StreamingContext对象 * @author ChengLong 2019-5-18 11:03:59 */ class StreamingContextExt(ssc: StreamingContext) extends Logging { import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent private[this] lazy val appName = ssc.sparkContext.appName /** * 创建DStream流 * * @param kafkaParams * kafka参数 * @param topics * topic列表 * @return * DStream */ def createDirectStream(kafkaParams: Map[String, Object] = null, topics: Set[String] = null, groupId: String = null, keyNum: Int = 1): DStream[ConsumerRecord[String, String]] = { // kafka topic优先级:配置文件 > topics参数 val confTopic = FireKafkaConf.kafkaTopics(keyNum) val finalKafkaTopic = if (StringUtils.isNotBlank(confTopic)) SparkUtils.topicSplit(confTopic) else topics require(finalKafkaTopic != null && finalKafkaTopic.nonEmpty, s"kafka topic不能为空,请在配置文件中指定:spark.kafka.topics$keyNum") this.logger.info(s"kafka topic is $finalKafkaTopic") val confKafkaParams = com.zto.fire.common.util.KafkaUtils.kafkaParams(kafkaParams, groupId, keyNum = keyNum) require(confKafkaParams.nonEmpty, "kafka相关配置不能为空!") require(confKafkaParams.contains("bootstrap.servers"), s"kafka bootstrap.servers不能为空,请在配置文件中指定:spark.kafka.brokers.name$keyNum") require(confKafkaParams.contains("group.id"), s"kafka group.id不能为空,请在配置文件中指定:spark.kafka.group.id$keyNum") // kafka消费信息埋点 LineageManager.addMQDatasource("kafka", confKafkaParams("bootstrap.servers").toString, finalKafkaTopic.mkString("", ", ", ""), confKafkaParams("group.id").toString, FOperation.SOURCE) KafkaUtils.createDirectStream[String, String]( ssc, PreferConsistent, Subscribe[String, String](finalKafkaTopic, confKafkaParams)) } /** * 构建RocketMQ拉取消息的DStream流 * * @param rocketParam * rocketMQ相关消费参数 * @param groupId * groupId * @param topics * topic列表 * @param consumerStrategy * 从何处开始消费 * @return * rocketMQ DStream */ def createRocketPullStream(rocketParam: JMap[String, String] = null, groupId: String = this.appName, topics: String = null, tag: String = null, consumerStrategy: ConsumerStrategy = ConsumerStrategy.lastest, locationStrategy: LocationStrategy = LocationStrategy.PreferConsistent, instance: String = "", keyNum: Int = 1): DStream[MessageExt] = { // 获取topic信息,配置文件优先级高于代码中指定的 val confTopics = FireRocketMQConf.rocketTopics(keyNum) val finalTopics = if (StringUtils.isNotBlank(confTopics)) confTopics else topics require(StringUtils.isNotBlank(finalTopics), s"RocketMQ的Topics不能为空,请在配置文件中指定:spark.rocket.topics$keyNum") // 起始消费位点 val confOffset = FireRocketMQConf.rocketStartingOffset(keyNum) val finalConsumerStrategy = if (StringUtils.isNotBlank(confOffset)) RocketMQUtils.valueOfStrategy(confOffset) else consumerStrategy // 是否自动提交offset val finalAutoCommit = FireRocketMQConf.rocketEnableAutoCommit(keyNum) // 
groupId信息 val confGroupId = FireRocketMQConf.rocketGroupId(keyNum) val finalGroupId = if (StringUtils.isNotBlank(confGroupId)) confGroupId else groupId require(StringUtils.isNotBlank(finalGroupId), s"RocketMQ的groupId不能为空,请在配置文件中指定:spark.rocket.group.id$keyNum") // 详细的RocketMQ配置信息 val finalRocketParam = RocketMQUtils.rocketParams(rocketParam, finalGroupId, rocketNameServer = null, tag = tag, keyNum) require(!finalRocketParam.isEmpty, "RocketMQ相关配置不能为空!") require(finalRocketParam.containsKey(RocketMQConfig.NAME_SERVER_ADDR), s"RocketMQ nameserver.addr不能为空,请在配置文件中指定:spark.rocket.brokers.name$keyNum") require(finalRocketParam.containsKey(RocketMQConfig.CONSUMER_TAG), s"RocketMQ tag不能为空,请在配置文件中指定:spark.rocket.consumer.tag$keyNum") // 消费者标识 val instanceId = FireRocketMQConf.rocketInstanceId(keyNum) val finalInstanceId = if (StringUtils.isNotBlank(instanceId)) instanceId else instance if (StringUtils.isNotBlank(finalInstanceId)) finalRocketParam.put("consumer.instance", finalInstanceId) // 消费rocketmq埋点信息 LineageManager.addMQDatasource("rocketmq", finalRocketParam(RocketMQConfig.NAME_SERVER_ADDR), finalTopics, finalGroupId, FOperation.SOURCE) val inputDStream = RocketMqUtils.createMQPullStream(this.ssc, finalGroupId, finalTopics.split(",").toList, finalConsumerStrategy, finalAutoCommit, forceSpecial = FireRocketMQConf.rocketForceSpecial(keyNum), failOnDataLoss = FireRocketMQConf.rocketFailOnDataLoss(keyNum), locationStrategy, finalRocketParam) if ("*".equals(finalRocketParam(RocketMQConfig.CONSUMER_TAG))) { inputDStream } else { inputDStream.filter(msg => msg.getTags.equals(finalRocketParam(RocketMQConfig.CONSUMER_TAG))) } } /** * 开启streaming */ def startAwaitTermination(): Unit = { ssc.start() ssc.awaitTermination() Thread.currentThread().join() } /** * 提交Spark Streaming Graph并执行 */ def start: Unit = this.startAwaitTermination() } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/provider/HBaseBulkProvider.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.ext.provider import com.zto.fire._ import com.zto.fire.hbase.HBaseConnector import com.zto.fire.hbase.bean.HBaseBaseBean import com.zto.fire.spark.connector.HBaseBulkConnector import org.apache.hadoop.hbase.client.Scan import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, Encoders} import org.apache.spark.streaming.dstream.DStream import scala.reflect.ClassTag /** * 为扩展层提供HBase bulk api * * @author ChengLong * @since 2.0.0 * @create 2020-12-23 17:31 */ trait HBaseBulkProvider extends SparkProvider { /** * scan数据,并转为RDD * * @param tableName * HBase表名 * @param scan * scan对象 * @param clazz * 对应的返回值类型 * @return * clazz类型的rdd */ def hbaseBulkScanRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan, keyNum: Int = 1): RDD[T] = { HBaseBulkConnector.bulkScanRDD(tableName, clazz, scan, keyNum) } /** * scan数据,并转为RDD * * @param tableName * HBase表名 * @param startRow * 开始 * @param stopRow * 结束 * @param clazz * 对应的返回值类型 * @return * clazz类型的rdd */ def hbaseBulkScanRDD2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String, keyNum: Int = 1): RDD[T] = { HBaseBulkConnector.bulkScanRDD2(tableName, clazz, startRow, stopRow, keyNum) } /** * 使用bulk方式scan数据,并转为DataFrame * * @param tableName * HBase表名 * @param scan * scan对象 * @param clazz * 对应的返回值类型 * @return * clazz类型的rdd */ def hbaseBulkScanDF[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan, keyNum: Int = 1): DataFrame = { val rdd = HBaseBulkConnector.bulkScanRDD(tableName, clazz, scan, keyNum) this.spark.createDataFrame(rdd, clazz) } /** * 使用bulk方式scan数据,并转为DataFrame * * @param tableName * HBase表名 * @param startRow * 开始 * @param stopRow * 结束 * @param clazz * 对应的返回值类型 * @return * clazz类型的rdd */ def hbaseBulkScanDF2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String, keyNum: Int = 1): DataFrame = { this.hbaseBulkScanDF[T](tableName, clazz, HBaseConnector.buildScan(startRow, stopRow), keyNum) } /** * 使用bulk方式scan数据,并转为Dataset * * @param tableName * HBase表名 * @param scan * scan对象 * @param clazz * 对应的返回值类型 * @return * clazz类型的rdd */ def hbaseBulkScanDS[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan, keyNum: Int = 1): Dataset[T] = { val rdd = HBaseBulkConnector.bulkScanRDD(tableName, clazz, scan, keyNum) this.spark.createDataset(rdd)(Encoders.bean(clazz)) } /** * 使用bulk方式scan数据,并转为DataFrame * * @param tableName * HBase表名 * @param startRow * 开始 * @param stopRow * 结束 * @param clazz * 对应的返回值类型 * @return * clazz类型的rdd */ def hbaseBulkScanDS2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String, keyNum: Int = 1): Dataset[T] = { this.hbaseBulkScanDS[T](tableName, clazz, HBaseConnector.buildScan(startRow, stopRow), keyNum) } /** * 使用bulk方式批量插入数据 * * @param tableName * HBase表名 * 数据集合,继承自HBaseBaseBean */ def hbaseBulkPutRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, rdd: RDD[T], keyNum: Int = 1): Unit = { rdd.hbaseBulkPutRDD(tableName, keyNum) } /** hbaseInsertList * 批量写入,将自定义的JavaBean数据集批量并行写入 * 到HBase的指定表中。内部会将自定义JavaBean的相应 * 字段一一映射为Put对象,并完成一次写入 * * @param tableName * HBase表名 * @tparam T * 数据类型为HBaseBaseBean的子类 */ def hbaseBulkPutDF[T <: HBaseBaseBean[T] : ClassTag](tableName: String, dataFrame: DataFrame, clazz: Class[T], keyNum: Int = 1): Unit = { dataFrame.hbaseBulkPutDF[T](tableName, clazz, keyNum) } /** * 批量写入,将自定义的JavaBean数据集批量并行写入 * 
到HBase的指定表中。内部会将自定义JavaBean的相应 * 字段一一映射为Put对象,并完成一次写入 * * @param tableName * HBase表名 * @param dataset * dataFrame实例,数类型需继承自HBaseBaseBean * @tparam T * 数据类型为HBaseBaseBean的子类 */ def hbaseBulkPutDS[T <: HBaseBaseBean[T] : ClassTag](tableName: String, dataset: Dataset[T], keyNum: Int = 1): Unit = { dataset.hbaseBulkPutDS[T](tableName, keyNum) } /** * DStrea数据实时写入 * * @param tableName * HBase表名 */ def hbaseBulkPutStream[T <: HBaseBaseBean[T] : ClassTag](tableName: String, dstream: DStream[T], keyNum: Int = 1): Unit = { dstream.hbaseBulkPutStream[T](tableName, keyNum) } /** * 根据RDD[String]批量删除 * * @param tableName * HBase表名 * @param rowKeyRDD * 装有rowKey的rdd集合 */ def hbaseBulkDeleteRDD(tableName: String, rowKeyRDD: RDD[String], keyNum: Int = 1): Unit = { rowKeyRDD.hbaseBulkDeleteRDD(tableName, keyNum) } /** * 根据Dataset[String]批量删除,Dataset是rowkey的集合 * 类型为String * * @param tableName * HBase表名 */ def hbaseBulkDeleteDS(tableName: String, dataSet: Dataset[String], keyNum: Int = 1): Unit = { dataSet.hbaseBulkDeleteDS(tableName, keyNum) } /** * 根据rowKey集合批量获取数据,并映射为自定义的JavaBean类型 * 内部实现是将rowkey集合转为RDD[String],推荐在数据量较大 * 时使用。数据量较小请优先使用HBaseOper * * @param tableName * HBase表名 * @param clazz * 具体类型 * @param seq * rowKey集合 * @tparam E * 自定义JavaBean类型,必须继承自HBaseBaseBean * @return * 自定义JavaBean的对象结果集 */ def hbaseBulkGetSeq[E <: HBaseBaseBean[E] : ClassTag](tableName: String, seq: Seq[String], clazz: Class[E], keyNum: Int = 1): RDD[E] = { HBaseBulkConnector.bulkGetSeq[E](tableName, seq, clazz, keyNum) } /** * 根据rowKey集合批量获取数据 * * @param tableName * HBase表名 * @param clazz * 获取后的记录转换为目标类型 * @return * 结果集 */ def hbaseBulkGetRDD[E <: HBaseBaseBean[E] : ClassTag](tableName: String, rowKeyRDD: RDD[String], clazz: Class[E], keyNum: Int = 1): RDD[E] = { rowKeyRDD.hbaseBulkGetRDD[E](tableName, clazz, keyNum) } /** * 根据rowKey集合批量获取数据,并映射为自定义的JavaBean类型 * * @param tableName * HBase表名 * @param clazz * 获取后的记录转换为目标类型(自定义的JavaBean类型) * @tparam E * 自定义JavaBean类型,必须继承自HBaseBaseBean * @return * 自定义JavaBean的对象结果集 */ def hbaseBulkGetDF[E <: HBaseBaseBean[E] : ClassTag](tableName: String, rowKeyRDD: RDD[String], clazz: Class[E], keyNum: Int = 1): DataFrame = { rowKeyRDD.hbaseBulkGetDF[E](tableName, clazz, keyNum) } /** * 根据rowKey集合批量获取数据,并映射为自定义的JavaBean类型 * * @param tableName * HBase表名 * @param clazz * 获取后的记录转换为目标类型(自定义的JavaBean类型) * @tparam E * 自定义JavaBean类型,必须继承自HBaseBaseBean * @return * 自定义JavaBean的对象结果集 */ def hbaseBulkGetDS[E <: HBaseBaseBean[E] : ClassTag](tableName: String, rowKeyRDD: RDD[String], clazz: Class[E], keyNum: Int = 1): Dataset[E] = { rowKeyRDD.hbaseBulkGetDS[E](tableName, clazz, keyNum) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/provider/HBaseConnectorProvider.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.ext.provider import com.zto.fire._ import com.zto.fire.hbase.HBaseConnector import com.zto.fire.hbase.bean.HBaseBaseBean import com.zto.fire.spark.connector.HBaseSparkBridge import org.apache.hadoop.hbase.client.{Get, Scan} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset} import scala.reflect.ClassTag /** * 为扩展层提供HBaseConnector相关API * * @author ChengLong * @since 1.0.0 * @create 2020-12-23 17:39 */ trait HBaseConnectorProvider extends SparkProvider { /** * Scan指定HBase表的数据,并映射为DataFrame * * @param tableName * HBase表名 * @param scan * scan对象 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseScanDF[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan, keyNum: Int = 1): DataFrame = { HBaseSparkBridge(keyNum = keyNum).hbaseScanDF(tableName, clazz, scan) } /** * Scan指定HBase表的数据,并映射为DataFrame * * @param tableName * HBase表名 * @param startRow * 开始主键 * @param stopRow 结束主键 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseScanDF2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String, keyNum: Int = 1): DataFrame = { HBaseSparkBridge(keyNum = keyNum).hbaseScanDF2(tableName, clazz, startRow, stopRow) } /** * Scan指定HBase表的数据,并映射为Dataset * * @param tableName * HBase表名 * @param scan * scan对象 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseScanDS[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan, keyNum: Int = 1): Dataset[T] = { HBaseSparkBridge(keyNum = keyNum).hbaseScanDS[T](tableName, clazz, scan) } /** * Scan指定HBase表的数据,并映射为Dataset * * @param tableName * HBase表名 * @param startRow * 开始主键 * @param stopRow 结束主键 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseScanDS2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String, keyNum: Int = 1): Dataset[T] = { HBaseSparkBridge(keyNum = keyNum).hbaseScanDS2[T](tableName, clazz, startRow, stopRow) } /** * 使用hbase java api方式插入一个集合的数据到hbase表中 * * @param tableName * hbase表名 * @param seq * HBaseBaseBean的子类集合 */ def hbasePutList[T <: HBaseBaseBean[T] : ClassTag](tableName: String, seq: Seq[T], keyNum: Int = 1): Unit = { HBaseSparkBridge(keyNum = keyNum).hbasePutList[T](tableName, seq) } /** * 使用Java API的方式将RDD中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 */ def hbasePutRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, rdd: RDD[T], keyNum: Int = 1): Unit = { rdd.hbasePutRDD[T](tableName, keyNum) } /** * 使用Java API的方式将DataFrame中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 * @param df * DataFrame * @param clazz * JavaBean类型,为HBaseBaseBean的子类 */ def hbasePutDF[E <: HBaseBaseBean[E] : ClassTag](tableName: String, df: DataFrame, clazz: Class[E], keyNum: Int = 1): Unit = { df.hbasePutDF(tableName, clazz, keyNum) } /** * 使用Java API的方式将Dataset中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 * @param clazz * JavaBean类型,为HBaseBaseBean的子类 */ def hbasePutDS[E <: HBaseBaseBean[E] : ClassTag](tableName: String, dataset: Dataset[E], clazz: Class[E], keyNum: Int = 1): Unit = { dataset.hbasePutDS[E](tableName, clazz, keyNum) } /** * Scan指定HBase表的数据,并映射为RDD[(ImmutableBytesWritable, Result)] * * @param tableName * HBase表名 * @param scan * HBase scan对象 * @return */ def hbaseScanRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan, keyNum: Int = 1): RDD[T] = { 
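A minimal sketch of the scan-to-DataFrame path exposed by this trait, assuming a job that mixes in HBaseConnectorProvider and a JavaBean class extending HBaseBaseBean; the table name and row-key range are placeholders:

import com.zto.fire.hbase.HBaseConnector
import com.zto.fire.hbase.bean.HBaseBaseBean
import com.zto.fire.spark.ext.provider.HBaseConnectorProvider
import scala.reflect.ClassTag

def scanToDF[T <: HBaseBaseBean[T] : ClassTag](job: HBaseConnectorProvider, clazz: Class[T]): Unit = {
  // build a scan over the row-key range, the same helper used by hbaseScanRDD2/hbaseScanDS2
  val scan = HBaseConnector.buildScan("10000", "20000")
  // keyNum defaults to 1, i.e. the first HBase cluster configured for the job
  val df = job.hbaseScanDF("fire_user", clazz, scan)
  df.createOrReplaceTempView("fire_user_scan")
  df.sparkSession.sql("SELECT count(1) FROM fire_user_scan").show()
}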
HBaseSparkBridge(keyNum = keyNum).hbaseScanRDD(tableName, clazz, scan) } /** * Scan指定HBase表的数据,并映射为RDD[(ImmutableBytesWritable, Result)] * * @param tableName * HBase表名 * @param startRow * rowKey开始位置 * @param stopRow * rowKey结束位置 * 目标类型 * @return */ def hbaseScanRDD2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String, keyNum: Int = 1): RDD[T] = { HBaseSparkBridge(keyNum = keyNum).hbaseScanRDD(tableName, clazz, HBaseConnector.buildScan(startRow, stopRow)) } /** * Scan指定HBase表的数据,并映射为List * * @param tableName * HBase表名 * @param scan * hbase scan对象 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseScanList[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan, keyNum: Int = 1): Seq[T] = { HBaseSparkBridge(keyNum = keyNum).hbaseScanList[T](tableName, clazz, scan) } /** * Scan指定HBase表的数据,并映射为List * * @param tableName * HBase表名 * @param startRow * 开始主键 * @param stopRow 结束主键 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseScanList2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String, keyNum: Int = 1): Seq[T] = { HBaseSparkBridge(keyNum = keyNum).hbaseScanList2[T](tableName, clazz, startRow, stopRow) } /** * 通过RDD[String]批量获取对应的数据(可获取历史版本的记录) * * @param tableName * HBase表名 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseGetRDD[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], rdd: RDD[String], keyNum: Int = 1): RDD[T] = { rdd.hbaseGetRDD(tableName, clazz, keyNum) } /** * 通过RDD[String]批量获取对应的数据(可获取历史版本的记录) * * @param tableName * HBase表名 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseGetDF[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], rdd: RDD[String], keyNum: Int = 1): DataFrame = { rdd.hbaseGetDF(tableName, clazz, keyNum) } /** * 通过RDD[String]批量获取对应的数据(可获取历史版本的记录) * * @param tableName * HBase表名 * @param clazz * 目标类型 * @tparam T * 目标类型 * @return */ def hbaseGetDS[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], rdd: RDD[String], keyNum: Int = 1): Dataset[T] = { rdd.hbaseGetDS[T](tableName, clazz, keyNum) } /** * 根据rowKey查询数据,并转为List[T] * * @param tableName * hbase表名 * @param seq * rowKey集合 * @param clazz * 目标类型 * @return * List[T] */ def hbaseGetList[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], seq: Seq[Get], keyNum: Int = 1): Seq[T] = { HBaseSparkBridge(keyNum = keyNum).hbaseGetList[T](tableName, clazz, seq) } /** * 根据rowKey查询数据,并转为List[T] * * @param tableName * hbase表名 * @param seq * rowKey集合 * @param clazz * 目标类型 * @return * List[T] */ def hbaseGetList2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], seq: Seq[String], keyNum: Int = 1): Seq[T] = { HBaseSparkBridge(keyNum = keyNum).hbaseGetList2[T](tableName, clazz, seq) } /** * 根据rowKey集合批量删除记录 * * @param tableName * hbase表名 * @param rowKeys * rowKey集合 */ def hbaseDeleteList(tableName: String, rowKeys: Seq[String], keyNum: Int = 1): Unit = { HBaseSparkBridge(keyNum = keyNum).hbaseDeleteList(tableName, rowKeys) } /** * 根据RDD[RowKey]批量删除记录 * * @param tableName * rowKey集合 * @param rowKeyRDD * rowKey的rdd集合 */ def hbaseDeleteRDD(tableName: String, rowKeyRDD: RDD[String], keyNum: Int = 1): Unit = { rowKeyRDD.hbaseDeleteRDD(tableName, keyNum) } /** * 根据Dataset[RowKey]批量删除记录 * * @param tableName * rowKey集合 */ def hbaseDeleteDS(tableName: String, dataset: Dataset[String], keyNum: Int = 1): Unit = { dataset.hbaseDeleteDS(tableName, keyNum) } } 
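Before moving on to the hadoop-style API in the next file, a short sketch of a get-then-delete round trip with the connector API above; it assumes a job mixing in HBaseConnectorProvider, and the table name and row keys are placeholders:

import com.zto.fire.hbase.bean.HBaseBaseBean
import com.zto.fire.spark.ext.provider.HBaseConnectorProvider
import scala.reflect.ClassTag

def getThenDelete[T <: HBaseBaseBean[T] : ClassTag](job: HBaseConnectorProvider, clazz: Class[T]): Unit = {
  val rowKeys = Seq("10001", "10002", "10003")
  // fetch the rows as JavaBeans through the HBase java api (keyNum defaults to 1)
  val beans: Seq[T] = job.hbaseGetList2("fire_user", clazz, rowKeys)
  beans.foreach(println)
  // delete the same rows afterwards
  job.hbaseDeleteList("fire_user", rowKeys)
}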
================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/provider/HBaseHadoopProvider.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.ext.provider import com.zto.fire._ import com.zto.fire.hbase.bean.HBaseBaseBean import com.zto.fire.spark.connector.HBaseSparkBridge import org.apache.hadoop.hbase.client.{Result, Scan} import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, Row} import scala.reflect.ClassTag /** * 为扩展层提供spark方式的HBase操作API * * @author ChengLong * @since 2.0.0 * @create 2020-12-23 17:41 */ trait HBaseHadoopProvider extends SparkProvider { /** * 使用Spark API的方式将RDD中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 */ def hbaseHadoopPutRDD[E <: HBaseBaseBean[E] : ClassTag](tableName: String, rdd: RDD[E], keyNum: Int = 1): Unit = { rdd.hbaseHadoopPutRDD(tableName, keyNum) } /** * 使用spark API的方式将DataFrame中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 * @param clazz * JavaBean类型,为HBaseBaseBean的子类 */ def hbaseHadoopPutDF[E <: HBaseBaseBean[E] : ClassTag](tableName: String, dataFrame: DataFrame, clazz: Class[E], keyNum: Int = 1): Unit = { dataFrame.hbaseHadoopPutDF(tableName, clazz, keyNum) } /** * 使用spark API的方式将DataFrame中的数据分多个批次插入到HBase中 * * @param tableName * HBase表名 * @param dataset * JavaBean类型,待插入到hbase的数据集 */ def hbaseHadoopPutDS[E <: HBaseBaseBean[E] : ClassTag](tableName: String, dataset: Dataset[E], keyNum: Int = 1): Unit = { dataset.hbaseHadoopPutDS[E](tableName, keyNum) } /** * 以spark 方式批量将DataFrame数据写入到hbase中 * * @param tableName * hbase表名 * @tparam T * JavaBean类型 */ def hbaseHadoopPutDFRow[T <: HBaseBaseBean[T] : ClassTag](tableName: String, dataFrame: DataFrame, buildRowKey: (Row) => String, keyNum: Int = 1): Unit = { dataFrame.hbaseHadoopPutDFRow[T](tableName, buildRowKey, keyNum) } /** * Scan指定HBase表的数据,并映射为RDD[(ImmutableBytesWritable, Result)] * * @param tableName * HBase表名 * @param scan * scan对象 * 目标类型 * @return */ def hbaseHadoopScanRS(tableName: String, scan: Scan, keyNum: Int = 1): RDD[(ImmutableBytesWritable, Result)] = { HBaseSparkBridge(keyNum = keyNum).hbaseHadoopScanRS(tableName, scan) } /** * Scan指定HBase表的数据,并映射为RDD[(ImmutableBytesWritable, Result)] * * @param tableName * HBase表名 * @param startRow * rowKey开始位置 * @param stopRow * rowKey结束位置 * 目标类型 * @return */ def hbaseHadoopScanRS2(tableName: String, startRow: String, stopRow: String, keyNum: Int = 1): RDD[(ImmutableBytesWritable, Result)] = { HBaseSparkBridge(keyNum = keyNum).hbaseHadoopScanRS2(tableName, startRow, stopRow) } /** * Scan指定HBase表的数据,并映射为RDD[(T] * * @param tableName * HBase表名 * @param scan * scan对象 * 目标类型 * @return */ def hbaseHadoopScanRDD[T <: 
HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan, keyNum: Int = 1): RDD[T] = { HBaseSparkBridge(keyNum = keyNum).hbaseHadoopScanRDD[T](tableName, clazz, scan) } /** * Scan指定HBase表的数据,并映射为RDD[T] * * @param tableName * HBase表名 * @param startRow * rowKey开始位置 * @param stopRow * rowKey结束位置 * 目标类型 * @return */ def hbaseHadoopScanRDD2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String, keyNum: Int = 1): RDD[T] = { HBaseSparkBridge(keyNum = keyNum).hbaseHadoopScanRDD2[T](tableName, clazz, startRow, stopRow) } /** * Scan指定HBase表的数据,并映射为RDD[(T] * * @param tableName * HBase表名 * @param scan * scan对象 * 目标类型 * @return */ def hbaseHadoopScanDF[T <: HBaseBaseBean[T] : ClassTag](tableName: String, scan: Scan, clazz: Class[T], keyNum: Int = 1): DataFrame = { HBaseSparkBridge(keyNum = keyNum).hbaseHadoopScanDF[T](tableName, clazz, scan) } /** * Scan指定HBase表的数据,并映射为RDD[(ImmutableBytesWritable, Result)] * * @param tableName * HBase表名 * @param startRow * rowKey开始位置 * @param stopRow * rowKey结束位置 * 目标类型 * @return */ def hbaseHadoopScanDF2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String, keyNum: Int = 1): DataFrame = { HBaseSparkBridge(keyNum = keyNum).hbaseHadoopScanDF2[T](tableName, clazz, startRow, stopRow) } /** * Scan指定HBase表的数据,并映射为RDD[(T] * * @param tableName * HBase表名 * @param scan * scan对象 * 目标类型 * @return */ def hbaseHadoopScanDS[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], scan: Scan, keyNum: Int = 1): Dataset[T] = { HBaseSparkBridge(keyNum = keyNum).hbaseHadoopScanDS[T](tableName, clazz, scan) } /** * Scan指定HBase表的数据,并映射为RDD[(ImmutableBytesWritable, Result)] * * @param tableName * HBase表名 * @param startRow * rowKey开始位置 * @param stopRow * rowKey结束位置 * 目标类型 * @return */ def hbaseHadoopScanDS2[T <: HBaseBaseBean[T] : ClassTag](tableName: String, clazz: Class[T], startRow: String, stopRow: String, keyNum: Int = 1): Dataset[T] = { HBaseSparkBridge(keyNum = keyNum).hbaseHadoopScanDS2[T](tableName, clazz, startRow, stopRow) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/provider/JdbcSparkProvider.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.ext.provider import com.zto.fire._ import com.zto.fire.jdbc.JdbcConnector import com.zto.fire.jdbc.conf.FireJdbcConf import com.zto.fire.spark.util.SparkUtils import org.apache.commons.lang3.StringUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row, SaveMode} import org.apache.spark.storage.StorageLevel import java.util.Properties import scala.reflect.ClassTag /** * 为扩展层提供jdbc相关的api * * @author ChengLong * @since 2.0.0 * @create 2020-12-23 17:48 */ trait JdbcSparkProvider extends SparkProvider { /** * 执行查询操作,以RDD方式返回结果集 * * @param sql * 查询语句 * @param params * sql执行参数 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return 查询结果集 */ def jdbcQueryRDD[T <: Object : ClassTag](sql: String, params: Seq[Any] = null, keyNum: Int = 1): RDD[Row] = { this.jdbcQueryDF(sql, params, keyNum).rdd } /** * 执行查询操作,以DataFrame方式返回结果集 * * @param sql * 查询语句 * @param params * JavaBean类型 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return 查询结果集 */ def jdbcQueryDF[T <: Object : ClassTag](sql: String, params: Seq[Any] = null, keyNum: Int = 1): DataFrame = { JdbcConnector.executeQuery(sql, params, keyNum = keyNum, callback = rs => { SparkUtils.resultSet2DataFrame(rs, keyNum) }).persist(StorageLevel.fromString(FireJdbcConf.jdbcStorageLevel)) } /** * 将DataFrame数据保存到关系型数据库中 * * @param dataFrame * DataFrame数据集 * @param tableName * 关系型数据库表名 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 */ def jdbcTableSave(dataFrame: DataFrame, tableName: String, saveMode: SaveMode = SaveMode.Append, jdbcProps: Properties = null, keyNum: Int = 1): Unit = { dataFrame.jdbcTableSave(tableName, saveMode, jdbcProps, keyNum) } /** * 单线程加载一张关系型数据库表 * 注:仅限用于小的表,不支持条件查询 * * @param tableName * 关系型数据库表名 * @param jdbcProps * 调用者指定的数据库连接信息,如果为空,则默认读取配置文件 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return * DataFrame */ def jdbcTableLoadAll(tableName: String, jdbcProps: Properties = null, keyNum: Int = 1): DataFrame = { this.spark.sqlContext.jdbcTableLoadAll(tableName, jdbcProps, keyNum) } /** * 指定load的条件,从关系型数据库中并行的load数据,并转为DataFrame * * @param tableName 数据库表名 * @param predicates * 并行load数据时,每一个分区load数据的where条件 * 比如:gmt_create >= '2019-06-20' AND gmt_create <= '2019-06-21' 和 gmt_create >= '2019-06-22' AND gmt_create <= '2019-06-23' * 那么将两个线程同步load,线程数与predicates中指定的参数个数保持一致 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return * 查询结果集 */ def jdbcTableLoad(tableName: String, predicates: Array[String], jdbcProps: Properties = null, keyNum: Int = 1): DataFrame = { this.spark.sqlContext.jdbcTableLoad(tableName, predicates, jdbcProps, keyNum) } /** * 根据指定字段的范围load关系型数据库中的数据 * * @param tableName * 表名 * @param columnName * 表的分区字段 * @param lowerBound * 分区的下边界 * @param upperBound * 分区的上边界 * @param jdbcProps * jdbc连接信息,默认读取配置文件 * @param keyNum * 配置文件中数据源配置的数字后缀,用于应对多数据源的情况,如果仅一个数据源,可不填 * 比如需要操作另一个数据库,那么配置文件中key需携带相应的数字后缀:spark.db.jdbc.url2,那么此处方法调用传参为3,以此类推 * @return */ def jdbcTableLoadBound(tableName: String, columnName: String, lowerBound: Long, upperBound: Long, numPartitions: Int = 10, jdbcProps: Properties = null, keyNum: Int = 1): DataFrame = { 
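A minimal sketch combining the bounded parallel load with jdbcTableSave, assuming a job that mixes in JdbcSparkProvider; the table and column names and the bounds are placeholders, and keyNum = 2 presumes a second datasource configured with the spark.db.jdbc.url2-style keys:

import com.zto.fire.spark.ext.provider.JdbcSparkProvider
import org.apache.spark.sql.SaveMode

def loadAndMirror(job: JdbcSparkProvider): Unit = {
  // load t_order split on the numeric id column (numPartitions defaults to 10)
  val orders = job.jdbcTableLoadBound("t_order", "id", lowerBound = 1L, upperBound = 1000000L)
  // append the result into a mirror table on the second datasource
  job.jdbcTableSave(orders, "t_order_mirror", SaveMode.Append, keyNum = 2)
}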
this.spark.sqlContext.jdbcTableLoadBound(tableName, columnName, lowerBound, upperBound, keyNum, jdbcProps, keyNum) } /** * 将DataFrame中指定的列写入到jdbc中 * 调用者需自己保证DataFrame中的列类型与关系型数据库对应字段类型一致 * * @param dataFrame * 将要插入到关系型数据库中原始的数据集 * @param sql * 关系型数据库待执行的增删改sql * @param fields * 指定部分DataFrame列名作为参数,顺序要对应sql中问号占位符的顺序 * 若不指定字段,则默认传入当前DataFrame所有列,且列的顺序与sql中问号占位符顺序一致 * @param batch * 每个批次执行多少条 * @param keyNum * 对应配置文件中指定的数据源编号 */ def jdbcBatchUpdateDF(dataFrame: DataFrame, sql: String, fields: Seq[String] = null, batch: Int = FireJdbcConf.batchSize(), keyNum: Int = 1): Unit = { require(dataFrame != null && StringUtils.isNotBlank(sql), "执行jdbcBatchUpdateDF失败,dataFrame或sql为空") dataFrame.jdbcBatchUpdate(sql, fields, batch, keyNum) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/provider/KafkaSparkProvider.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.ext.provider import com.zto.fire.common.conf.FireKafkaConf import com.zto.fire.common.util.{KafkaUtils, LogUtils} import com.zto.fire.spark.util.SparkUtils import com.zto.fire.{requireNonEmpty, retry, _} import org.apache.commons.lang3.StringUtils import org.apache.kafka.clients.consumer.ConsumerRecord import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions.from_json import org.apache.spark.sql.{DataFrame, Dataset, Encoders} /** * 为扩展层提供Kafka相关的API * * @author ChengLong * @since 2.0.0 * @create 2020-12-23 17:43 */ trait KafkaSparkProvider extends SparkProvider { import spark.implicits._ /** * 消费kafka中的json数据,并解析成json字符串 * * @param extraOptions * 消费kafka额外的参数,如果有key同时出现在配置文件中和extraOptions中,将被extraOptions覆盖 * @param keyNum * 配置文件中key的数字后缀 * @return * 转换成json字符串后的Dataset */ def loadKafka(extraOptions: Map[String, String] = null, keyNum: Int = 1): Dataset[(String, String)] = { val extraOptionsMap = new scala.collection.mutable.HashMap[String, String] if (extraOptions != null && extraOptions.nonEmpty) extraOptionsMap ++= extraOptions val confGroupId = FireKafkaConf.kafkaGroupId(keyNum) val groupId = if (StringUtils.isNotBlank(confGroupId)) confGroupId else spark.sparkContext.appName extraOptionsMap += ("group.id" -> groupId) val finalBrokers = FireKafkaConf.kafkaBrokers(keyNum) if (StringUtils.isNotBlank(finalBrokers)) extraOptionsMap += ("kafka.bootstrap.servers" -> finalBrokers) require(extraOptionsMap.contains("kafka.bootstrap.servers"), s"kafka bootstrap.servers不能为空,请在配置文件中指定:spark.kafka.brokers.name$keyNum") val topics = FireKafkaConf.kafkaTopics() if (StringUtils.isNotBlank(topics)) extraOptionsMap += ("subscribe" -> topics) require(extraOptionsMap.contains("subscribe"), s"kafka topic不能为空,请在配置文件中指定:spark.kafka.topics$keyNum") // 以spark.kafka.conf.开头的配置优先级最高 val 
configMap = FireKafkaConf.kafkaConfMap(keyNum) extraOptionsMap ++= configMap LogUtils.logMap(this.logger, extraOptionsMap.toMap, s"Kafka client configuration. keyNum=$keyNum.") val kafkaReader = spark.readStream .format("kafka") .options(extraOptionsMap) .load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING) as value") .as[(String, String)] kafkaReader } /** * 消费kafka中的json数据,并按照指定的schema解析成目标类型 * * @param schemaClass * json对应的javabean类型 * @param extraOptions * 消费kafka额外的参数 * @param parseAll * 是否解析所有字段信息 * @param isMySQL * 是否为mysql解析的消息 * @param fieldNameUpper * 字段名称是否为大写 * @return * 转换成json字符串后的Dataset */ def loadKafkaParse(schemaClass: Class[_], extraOptions: Map[String, String] = null, parseAll: Boolean = false, isMySQL: Boolean = true, fieldNameUpper: Boolean = false, keyNum: Int = 1): DataFrame = { val kafkaDataset = this.loadKafka(extraOptions, keyNum) val schemaDataset = kafkaDataset.select(from_json($"value", SparkUtils.buildSchema2Kafka(schemaClass, parseAll, isMySQL, fieldNameUpper)).as("data")) if (parseAll) schemaDataset.select("data.*") else schemaDataset.select("data.after.*") } /** * 消费kafka中的json数据,并自动解析json数据,将解析后的数据注册到tableName所指定的临时表中 * * @param tableName * 解析后的数据存放的临时表名,默认名为kafka * @param extraOptions * 消费kafka额外的参数 * @return * 转换成json字符串后的Dataset */ def loadKafkaParseJson(tableName: String = "kafka", extraOptions: Map[String, String] = null, keyNum: Int = 1): DataFrame = { val msg = retry(5, 1000) { KafkaUtils.getMsg(FireKafkaConf.kafkaBrokers(keyNum), FireKafkaConf.kafkaTopics(keyNum), null) } requireNonEmpty(msg, s"获取样例消息失败!请重启任务尝试重新获取,并保证topic[${FireKafkaConf.kafkaTopics(keyNum)}]持续的有新消息。") val jsonDS = this.spark.createDataset(Seq(msg))(Encoders.STRING) val jsonDF = this.spark.read.json(jsonDS) val kafkaDataset = this.loadKafka(extraOptions, keyNum) val schemaDataset = kafkaDataset.select(from_json($"value", jsonDF.schema).as(tableName)).select(s"${tableName}.*") schemaDataset.createOrReplaceTempView(tableName) schemaDataset } /** * 解析DStream中每个rdd的json数据,并转为DataFrame类型 * * @param schema * 目标DataFrame类型的schema * @param isMySQL * 是否为mysql解析的消息 * @param fieldNameUpper * 字段名称是否为大写 * @param parseAll * 是否需要解析所有字段信息 * @return */ def kafkaJson2DFV(rdd: RDD[String], schema: Class[_], parseAll: Boolean = false, isMySQL: Boolean = true, fieldNameUpper: Boolean = false): DataFrame = { rdd.kafkaJson2DFV(schema, parseAll, isMySQL, fieldNameUpper) } /** * 解析DStream中每个rdd的json数据,并转为DataFrame类型 * * @param schema * 目标DataFrame类型的schema * @param isMySQL * 是否为mysql解析的消息 * @param fieldNameUpper * 字段名称是否为大写 * @param parseAll * 是否解析所有字段信息 * @return */ def kafkaJson2DF(rdd: RDD[ConsumerRecord[String, String]], schema: Class[_], parseAll: Boolean = false, isMySQL: Boolean = true, fieldNameUpper: Boolean = false): DataFrame = { rdd.kafkaJson2DF(schema, parseAll, isMySQL, fieldNameUpper) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/provider/SparkProvider.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.ext.provider import com.zto.fire.core.ext.Provider import com.zto.fire.spark.util.SparkSingletonFactory /** * spark provider父接口 * * @author ChengLong * @since 2.0.0 * @create 2020-12-23 17:49 */ trait SparkProvider extends Provider { protected lazy val spark = SparkSingletonFactory.getSparkSession protected lazy val sc = spark.sparkContext } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/ext/provider/SqlProvider.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.ext.provider import com.zto.fire._ import com.zto.fire.common.conf.FireHiveConf import com.zto.fire.spark.conf.FireSparkConf import com.zto.fire.spark.udf.UDFs import com.zto.fire.spark.util.SparkSingletonFactory import org.apache.commons.lang3.StringUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, SaveMode, SparkSession} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.DStream /** * 为扩展层提供Spark SQL api * * @author ChengLong * @since 2.0.0 * @create 2020-12-23 17:35 */ trait SqlProvider extends SparkProvider { protected lazy val sqlContext = this.spark.sqlContext protected lazy val catalog = this.spark.catalog /** * 清理 RDD、DataFrame、Dataset、DStream、TableName 缓存 * 等同于unpersist * * @param any * RDD、DataFrame、Dataset、DStream、TableName */ def uncache(any: Any*): Unit = { if (any != null && any.nonEmpty) { any.foreach(elem => { if (elem != null) { if (elem.isInstanceOf[String]) { val tableName = elem.asInstanceOf[String] if (this.tableExists(tableName) && this.isCached(tableName)) { this.sqlContext.uncacheTables(tableName) } } else if (elem.isInstanceOf[Dataset[_]]) { elem.asInstanceOf[Dataset[_]].uncache } else if (elem.isInstanceOf[DataFrame]) { elem.asInstanceOf[DataFrame].uncache } else if (elem.isInstanceOf[RDD[_]]) { elem.asInstanceOf[RDD[_]].uncache } else if (elem.isInstanceOf[DStream[_]]) { elem.asInstanceOf[DStream[_]].uncache } } }) } } /** * 清理 RDD、DataFrame、Dataset、DStream、TableName 缓存 * 等同于uncache * * @param any * RDD、DataFrame、Dataset、DStream、TableName */ def unpersist(any: Any*): Unit = { this.uncache(any: _*) } /** * 清空所有缓存 */ def clearCache: Unit = this.catalog.clearCache() /** * 批量注册udf函数,包含系统内置的与用户自定义的 */ def registerUDF(): SparkSession = { 
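A minimal sketch of the cache cleanup entry points above, assuming a job that mixes in SqlProvider; the view name and DataFrame are placeholders:

import com.zto.fire.spark.ext.provider.SqlProvider
import org.apache.spark.sql.DataFrame

def releaseCaches(job: SqlProvider, detailDF: DataFrame): Unit = {
  // table names are only uncached when they exist and are currently cached;
  // DataFrame/Dataset/RDD/DStream arguments are unpersisted directly
  job.uncache("dim_site", detailDF)
  // or drop every cached table in the current SparkSession at once
  job.clearCache
}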
UDFs.registerSysUDF(this.spark) this.spark } /** * 用于判断当前SparkSession下临时表或Hive表是否存在 * * @param tableName * 表名 * @return * true:存在 false:不存在 */ def tableExists(tableName: String): Boolean = { this.catalog.tableExists(tableName) } /** * 用于判断当前SparkSession下临时表或Hive表是否存在 * * @param tableName * 表名 * @return * true:存在 false:不存在 */ def tableExists(dbName: String, tableName: String): Boolean = { this.catalog.tableExists(dbName, tableName) } /** * 执行一段Hive QL语句,注册为临时表,持久化到hive中 * * @param sqlStr * sql语句 * @param tmpTableName * 临时表名 * @param saveMode * 持久化的模式,默认为Overwrite * @param cache * 默认缓存表 * @return * 生成的DataFrame */ def sqlForPersistent(sqlStr: String, tmpTableName: String, partitionName: String, saveMode: SaveMode = SaveMode.valueOf(FireSparkConf.saveMode), cache: Boolean = true): DataFrame = { this.sqlContext.sqlForPersistent(sqlStr, tmpTableName, partitionName, saveMode, cache) } /** * 执行一段Hive QL语句,注册为临时表,并cache * * @param sqlStr * SQL语句 * @param tmpTableName * 临时表名 * @return * 生成的DataFrame */ def sqlForCache(sqlStr: String, tmpTableName: String): DataFrame = { this.sqlContext.sqlForCache(sqlStr, tmpTableName) } /** * 执行一段Hive QL语句,注册为临时表 * * @param sqlStr * SQL语句 * @param tmpTableName * 临时表名 * @return * 生成的DataFrame */ def sqlNoCache(sqlStr: String, tmpTableName: String): DataFrame = { this.sqlContext.sqlNoCache(sqlStr, tmpTableName) } /** * 批量缓存多张表 * * @param tables * 多个表名 */ def cacheTables(tables: String*)(implicit storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER): Unit = { tables.filter(noEmpty(_)).foreach(tableName => this.cacheTable(tableName, storageLevel)) } /** * 缓存指定的表 * * @param table * 表名 * @param storageLevel * 缓存级别 */ def cacheTable(table: String, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER): Unit = { this.catalog.cacheTable(table, storageLevel) } /** * 批量重新缓存多张表,已经缓存的会被uncache后再cache * * @param tables * 待缓存的多个表名 */ def recacheTables(tables: String*): Unit = { tables.filter(noEmpty(_)).foreach(table => { if (this.isCached(table)) this.uncache(table) this.cacheTables(table) }) } /** * 判断表是否被缓存 * * @param tableName * 表名 * @return */ def isCached(tableName: String): Boolean = { this.catalog.isCached(tableName) } /** * 判断表是否未被缓存 * * @param tableName * 表名 * @return */ def isNotCached(tableName: String): Boolean = !this.isCached(tableName) /** * refresh给定的表 * * @param tables * 表名 */ def refreshTables(tables: String*): Unit = { tables.filter(noEmpty(_)).foreach(table => this.catalog.refreshTable(table)) } /** * 缓存或刷新给定的表 * 1. 当表未被cache时会首先进行cache * 2. 
当表已被cache,再次调用会进行refresh操作 * * @param tables * 待cache或refresh的表名集合 */ def cacheOrRefreshTables(tables: String*)(implicit storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER): Unit = { tables.filter(noEmpty(_)).foreach(table => { if (this.isNotCached(table)) this.cacheTable(table, storageLevel) else this.refreshTables(table) }) } /** * 删除指定的hive表 * * @param tableNames * 多个表名 */ def dropHiveTable(tableNames: String*): Unit = { this.sqlContext.dropHiveTable(tableNames: _*) } /** * 为指定表添加分区 * * @param tableName * 表名 * @param partitions * 分区 */ def addPartitions(tableName: String, partitions: String*): Unit = { this.sqlContext.addPartitions(tableName, partitions: _*) } /** * 为指定表添加分区 * * @param tableName * 表名 * @param partition * 分区 * @param partitionName * 分区字段名称,默认ds */ def addPartition(tableName: String, partition: String, partitionName: String = FireHiveConf.partitionName): Unit = { this.sqlContext.addPartition(tableName, partition, partitionName) } /** * 为指定表删除分区 * * @param tableName * 表名 * @param partition * 分区 */ def dropPartition(tableName: String, partition: String, partitionName: String = FireHiveConf.partitionName): Unit = { this.sqlContext.dropPartition(tableName, partition, partitionName) } /** * 为指定表删除多个分区 * * @param tableName * 表名 * @param partitions * 分区 */ def dropPartitions(tableName: String, partitions: String*): Unit = { this.sqlContext.dropPartitions(tableName, partitions: _*) } /** * 根据给定的表创建新表 * * @param srcTableName * 源表 * @param destTableName * 目标表 */ def createTableAsSelect(srcTableName: String, destTableName: String): Unit = { this.sqlContext.createTableAsSelect(srcTableName, destTableName) } /** * 根据一张表创建另一张表 * * @param tableName * 表名 * @param destTableName * 目标表名 */ def createTableLike(tableName: String, destTableName: String): Unit = { this.sqlContext.createTableLike(tableName, destTableName) } /** * 根据给定的表创建新表 * * @param srcTableName * 来源表 * @param destTableName * 目标表 * @param cols * 多个列,逗号分隔 */ def createTableAsSelectFields(srcTableName: String, destTableName: String, cols: String): Unit = { this.sqlContext.createTableAsSelectFields(srcTableName, destTableName, cols) } /** * 将数据插入到指定表的分区中 * * @param srcTableName * 来源表 * @param destTableName * 目标表 * @param ds * 分区名 * @param cols * 多个列,逗号分隔 */ def insertIntoPartition(srcTableName: String, destTableName: String, ds: String, cols: String, partitionName: String = FireHiveConf.partitionName): Unit = { this.sqlContext.insertIntoPartition(srcTableName, destTableName, ds, cols, partitionName) } /** * 将sql执行结果插入到目标表指定分区中 * * @param destTableName * 目标表名 * @param ds * 分区名 * @param querySQL * 查询语句 */ def insertIntoPartitionAsSelect(destTableName: String, ds: String, querySQL: String, partitionName: String = FireHiveConf.partitionName, overwrite: Boolean = false): Unit = { this.sqlContext.insertIntoPartitionAsSelect(destTableName, ds, querySQL, partitionName, overwrite) } /** * 将sql执行结果插入到目标表指定分区中 * * @param destTableName * 目标表名 * @param querySQL * 查询sql语句 */ def insertIntoDymPartitionAsSelect(destTableName: String, querySQL: String, partitionName: String = FireHiveConf.partitionName): Unit = { this.sqlContext.insertIntoDymPartitionAsSelect(destTableName, querySQL, partitionName) } /** * 修改表名 * * @param oldTableName * 表名称 * @param newTableName * 新的表名 */ def rename(oldTableName: String, newTableName: String): Unit = { this.sqlContext.rename(oldTableName, newTableName) } /** * 将表从一个db移动到另一个db中 * * @param tableName * 表名 * @param oldDB * 老库名称 * @param newDB * 新库名称 */ def moveDB(tableName: String, oldDB: String, newDB: 
String): Unit = { this.sqlContext.moveDB(tableName, oldDB, newDB) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/listener/FireSparkListener.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.listener import com.zto.fire.common.anno.Scheduled import com.zto.fire.common.enu.JobType import com.zto.fire.common.exception.FireSparkException import com.zto.fire.common.util.{ExceptionBus, Logging} import com.zto.fire.spark.BaseSpark import com.zto.fire.spark.acc.AccumulatorManager import com.zto.fire.spark.conf.FireSparkConf import com.zto.fire.spark.sync.SyncSparkEngine import org.apache.spark.SparkException import org.apache.spark.scheduler._ import java.util.concurrent.atomic.{AtomicBoolean, AtomicLong} /** * Spark事件监听器桥 * Created by ChengLong on 2018-05-19. */ private[fire] class FireSparkListener(baseSpark: BaseSpark) extends SparkListener with Logging { private[this] val module = "listener" private[this] val needRegister = new AtomicBoolean(false) // 用于统计stage失败的次数 private[this] lazy val stageFailedCount = new AtomicLong(0) /** * 当SparkContext启动时触发 */ override def onApplicationStart(applicationStart: SparkListenerApplicationStart): Unit = { this.logger.info(s"Spark 初始化完成.") this.baseSpark.onApplicationStart(applicationStart) } /** * fire 框架退出 */ private[this] def exit: Unit = { try { this.baseSpark.after() } finally { this.baseSpark.shutdown(inListener = true) } } /** * 当Spark运行结束时执行 */ override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = { this.exit super.onApplicationEnd(applicationEnd) } /** * 当executor metrics更新时触发 */ override def onExecutorMetricsUpdate(executorMetricsUpdate: SparkListenerExecutorMetricsUpdate): Unit = this.baseSpark.onExecutorMetricsUpdate(executorMetricsUpdate) /** * 当添加新的executor时,重新初始化内置的累加器 */ override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = { this.baseSpark.onExecutorAdded(executorAdded) if (this.baseSpark.jobType != JobType.SPARK_CORE) this.needRegister.compareAndSet(false, true) this.logger.debug(s"executor[${executorAdded.executorId}] added. host: [${executorAdded.executorInfo.executorHost}].", this.module) } /** * 当移除已有的executor时,executor数递减 */ override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = { this.baseSpark.onExecutorRemoved(executorRemoved) this.logger.debug(s"executor[${executorRemoved.executorId}] removed. 
reason: [${executorRemoved.reason}].", this.module) } /** * 当环境信息更新时触发 */ override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate): Unit = this.baseSpark.onEnvironmentUpdate(environmentUpdate) /** * 当BlockManager添加时触发 */ override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded): Unit = this.baseSpark.onBlockManagerAdded(blockManagerAdded) /** * 当BlockManager移除时触发 */ override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved): Unit = this.baseSpark.onBlockManagerRemoved(blockManagerRemoved) /** * 当block更新时触发 */ override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = this.baseSpark.onBlockUpdated(blockUpdated) /** * 当job开始执行时触发 */ override def onJobStart(jobStart: SparkListenerJobStart): Unit = this.baseSpark.onJobStart(jobStart) /** * 当job执行完成时触发 */ override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { this.baseSpark.onJobEnd(jobEnd) if (jobEnd != null && jobEnd.jobResult == JobSucceeded) { AccumulatorManager.addMultiTimer(module, "onJobEnd", "onJobEnd", "", "INFO", "", 1) } else { AccumulatorManager.addMultiTimer(module, "onJobEnd", "onJobEnd", "", "ERROR", "", 1) this.logger.error(s"job failed.", this.module) } } /** * 当stage提交以后触发 */ override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = this.baseSpark.onStageSubmitted(stageSubmitted) /** * 当stage执行完成以后触发 */ override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { this.baseSpark.onStageCompleted(stageCompleted) if (stageCompleted != null && stageCompleted.stageInfo.failureReason.isEmpty) { AccumulatorManager.addMultiTimer(module, "onStageCompleted", "onStageCompleted", "", "INFO", "", 1) } else { AccumulatorManager.addMultiTimer(module, "onStageCompleted", "onStageCompleted", "", "ERROR", "", 1) this.logger.error(s"stage failed. 
reason: " + stageCompleted.stageInfo.failureReason, this.module) AccumulatorManager.addLog(stageCompleted.stageInfo.failureReason.getOrElse("")) // 异常信息统一投递到Fire异常总线 ExceptionBus.post(new FireSparkException(stageCompleted.stageInfo.failureReason.get)) // spark.fire.stage.maxFailures参数用于控制stage允许的最大失败次数,小于等于零表示不开启,默认-1 // 当配置为2时表示最多允许失败2个stage,当第三个stage失败时SparkSession退出 if (this.stageFailedCount.addAndGet(1) > FireSparkConf.stageMaxFailures && FireSparkConf.stageMaxFailures > 0) this.exit } } /** * 当task开始执行时触发 */ override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = this.baseSpark.onTaskStart(taskStart) /** * 当从task获取计算结果时触发 */ override def onTaskGettingResult(taskGettingResult: SparkListenerTaskGettingResult): Unit = this.baseSpark.onTaskGettingResult(taskGettingResult) /** * 当task执行完成以后触发 */ override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { this.baseSpark.onTaskEnd(taskEnd) if (taskEnd != null && taskEnd.reason != null && "Success".equalsIgnoreCase(taskEnd.reason.toString)) { AccumulatorManager.addMultiTimer(module, "onTaskEnd", "onTaskEnd", "", "INFO", "", 1) } else { AccumulatorManager.addMultiTimer(module, "onTaskEnd", "onTaskEnd", "", "ERROR", "", 1) this.logger.error(s"task failed.", this.module) } } /** * 当取消缓存RDD时触发 */ override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = this.baseSpark.onUnpersistRDD(unpersistRDD) /** * 用于注册内置累加器,每隔1分钟执行一次,延迟1分钟执行,默认执行10次 */ @Scheduled(fixedInterval = 60 * 1000, initialDelay = 60 * 1000, concurrent = false, repeatCount = 10) private[fire] def registerAcc: Unit = { if (this.needRegister.compareAndSet(true, false)) { AccumulatorManager.registerAccumulators(this.baseSpark.sc) SyncSparkEngine.syncDynamicConf(this.baseSpark.sc, this.baseSpark._conf) this.logger.info(s"完成系统累加器注册.", this.module) } } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/listener/FireStreamingQueryListener.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.listener import org.apache.spark.sql.streaming.StreamingQueryListener /** * structured streaming事件监听器 * * @author ChengLong 2019年12月24日 16:26:33 * @since 0.4.1 */ private[fire] class FireStreamingQueryListener extends StreamingQueryListener { @volatile protected var latestBatchId = -1L override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = { // onQueryStarted } override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = { this.latestBatchId = event.progress.batchId } override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = { // onQueryTerminated } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/plugin/SparkArthasLauncher.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.plugin import com.zto.fire.common.util.OSUtils import com.zto.fire.core.plugin.{ArthasLauncher, ArthasManager} import com.zto.fire.predef._ import com.zto.fire.spark.sync.DistributeSyncManager import com.zto.fire.spark.util.SparkUtils /** * Spark Arthas分布式启动器 * * @author ChengLong 2021-11-3 15:38:20 * @since 2.2.0 */ private[fire] class SparkArthasLauncher extends ArthasLauncher { /** * 热启动Arthas * * @param isDistribute * 是否在每个container端启动arthas * @param ip * 仅在某些主机上启动 */ override def hotStart(isDistribute: Boolean, ip: String): Unit = { ArthasManager.startArthas(SparkUtils.getExecutorId) if (isDistribute) { DistributeSyncManager.sync({ if (isEmpty(ip) || ip.contains(OSUtils.getIp)) ArthasManager.startArthas(s"container_${SparkUtils.getExecutorId}") }) } } /** * 分布式热关闭Arthas相关服务 * * @param isDistribute * 是否在每个container端停止arthas * @param ip * 仅在某些主机上启动 */ override def hotStop(isDistribute: Boolean, ip: String): Unit = { ArthasManager.stopArthas if (isDistribute) { DistributeSyncManager.sync({ if (isEmpty(ip) || ip.contains(OSUtils.getIp)) ArthasManager.stopArthas }) } } /** * 分布式热重启Arthas相关服务 * * @param isDistribute * 是否在每个container端重启arthas * @param ip * 仅在某些主机上启动 */ override def hotRestart(isDistribute: Boolean, ip: String): Unit = { ArthasManager.restartArthas(SparkUtils.getExecutorId) if (isDistribute) { DistributeSyncManager.sync({ if (isEmpty(ip) || ip.contains(OSUtils.getIp)) ArthasManager.restartArthas(SparkUtils.getExecutorId) }) } } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/rest/SparkSystemRestful.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.rest import com.google.common.collect.Table import com.zto.fire.common.anno.Rest import com.zto.fire.common.bean.rest.ResultMsg import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.enu.{ErrorCode, RequestMethod} import com.zto.fire.common.util._ import com.zto.fire.core.rest.{RestCase, SystemRestful} import com.zto.fire.spark.{BaseSpark, bean} import org.apache.commons.lang3.StringUtils import spark._ import com.zto.fire._ import com.zto.fire.core.bean.ArthasParam import com.zto.fire.spark.bean.{ColumnMeta, FunctionMeta, SparkInfo} import com.zto.fire.spark.plugin.SparkArthasLauncher import com.zto.fire.spark.sync.SyncSparkEngine import java.util /** * 系统预定义的restful服务,为Spark计算引擎提供接口服务 * * @author ChengLong 2019-3-16 10:16:38 */ private[fire] class SparkSystemRestful(val baseSpark: BaseSpark) extends SystemRestful(baseSpark) { private var sparkInfoBean: SparkInfo = _ /** * 注册Spark引擎接口 */ override def register: Unit = { this.baseSpark.restfulRegister .addRest(RestCase(RequestMethod.DELETE.toString, s"/system/kill", kill)) .addRest(RestCase(RequestMethod.DELETE.toString, s"/system/cancelJob", cancelJob)) .addRest(RestCase(RequestMethod.DELETE.toString, s"/system/cancelStage", cancelStage)) .addRest(RestCase(RequestMethod.POST.toString, s"/system/sql", sql)) .addRest(RestCase(RequestMethod.GET.toString, s"/system/sparkInfo", sparkInfo)) .addRest(RestCase(RequestMethod.GET.toString, s"/system/counter", counter)) .addRest(RestCase(RequestMethod.GET.toString, s"/system/multiCounter", multiCounter)) .addRest(RestCase(RequestMethod.POST.toString, s"/system/multiTimer", multiTimer)) .addRest(RestCase(RequestMethod.POST.toString, s"/system/log", log)) .addRest(RestCase(RequestMethod.POST.toString, s"/system/env", env)) .addRest(RestCase(RequestMethod.GET.toString, s"/system/listDatabases", listDatabases)) .addRest(RestCase(RequestMethod.POST.toString, s"/system/listTables", listTables)) .addRest(RestCase(RequestMethod.POST.toString, s"/system/listColumns", listColumns)) .addRest(RestCase(RequestMethod.POST.toString, s"/system/listFunctions", listFunctions)) .addRest(RestCase(RequestMethod.POST.toString, s"/system/setConf", setConf)) .addRest(RestCase(RequestMethod.GET.toString, s"/system/datasource", datasource)) .addRest(RestCase(RequestMethod.GET.toString, s"/system/lineage", lineage)) .addRest(RestCase(RequestMethod.POST.toString, s"/system/arthas", arthas)) .addRest(RestCase(RequestMethod.GET.toString, s"/system/exception", exception)) } /** * 用于更新配置信息 */ @Rest("/system/setConf") def setConf(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/setConf") this.logger.info(s"请求fire更新配置信息:$json") val confMap = JSONUtils.parseObject[java.util.HashMap[String, String]](json) if (ValueUtils.noEmpty(confMap)) { PropUtils.setProperties(confMap) this.baseSpark._conf.setAll(PropUtils.settings) 
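// 说明(示例仅为示意,key可为任意合法配置项):/system/setConf 接收的请求体是一个K-V配置Map,例如:
// {"spark.sql.shuffle.partitions": "200"}
// 配置合并到PropUtils并写入SparkConf后,下一行通过广播将最新配置同步到各executor端(见SyncSparkEngine.syncDynamicConf)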
SyncSparkEngine.syncDynamicConf(this.baseSpark.sc, this.baseSpark._conf) } ResultMsg.buildSuccess("配置信息已更新", ErrorCode.SUCCESS.toString) } catch { case e: Exception => { this.logger.error(s"[setConf] 设置配置信息失败:json=$json", e) ResultMsg.buildError("设置配置信息失败", ErrorCode.ERROR) } } } /** * 根据函数信息 */ @Rest("/system/listFunctions") def listFunctions(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/listFunctions") // 参数合法性检查 val dbName = JSONUtils.getValue(json, "dbName", "") // 获取已注册的函数 val funList = new util.LinkedList[FunctionMeta]() if (StringUtils.isNotBlank(dbName)) { this.baseSpark.catalog.listFunctions(dbName).collect().foreach(fun => { funList.add(new FunctionMeta(fun.description, fun.database, fun.name, fun.className, fun.isTemporary)) }) } else { this.baseSpark.catalog.listFunctions().collect().foreach(fun => { funList.add(new FunctionMeta(fun.description, fun.database, fun.name, fun.className, fun.isTemporary)) }) } this.logger.info(s"[listFunctions] 获取[$dbName]函数信息成功:json=$json") ResultMsg.buildSuccess(funList, s"获取[$dbName]函数信息成功") } catch { case e: Exception => { this.logger.error(s"[log] 获取函数信息失败:json=$json", e) ResultMsg.buildError("获取函数信息失败", ErrorCode.ERROR) } } } /** * 根据表名获取字段信息 */ @Rest("/system/listColumns") def listColumns(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/listColumns") // 参数合法性检查 val dbName = JSONUtils.getValue(json, "dbName", "memory") val tableName = JSONUtils.getValue(json, "tableName", "") if (StringUtils.isBlank(dbName) || StringUtils.isBlank(tableName)) { return ResultMsg.buildError("获取表元字段信息失败,库名和表名不能为空", ErrorCode.PARAM_ILLEGAL) } // 区分内存临时表和物理表 val columns = if ("memory".equals(dbName)) { this.baseSpark.catalog.listColumns(tableName) } else { this.baseSpark.catalog.listColumns(dbName, tableName) } // 将字段元数据信息封装 val columnList = new util.LinkedList[ColumnMeta] columns.collect().foreach(column => { val meta = new ColumnMeta.Builder().setColumnName(column.name) .setBucket(column.isBucket) .setDatabase(dbName) .setDataType(column.dataType) .setTableName(tableName) .setDescription(column.description) .setNullable(column.nullable) .setPartition(column.isPartition).build() columnList.add(meta) }) this.logger.info(s"[listColumns] 获取[$dbName.$tableName]字段信息成功:json=$json") ResultMsg.buildSuccess(columnList, s"获取[$dbName.$tableName]字段信息成功") } catch { case e: Exception => { this.logger.error(s"[log] 获取表字段信息失败:json=$json", e) ResultMsg.buildError("获取表字段信息失败", ErrorCode.ERROR) } } } /** * 获取指定数据库下所有的表信息 */ @Rest("/system/listTables") def listTables(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/listTables") // 参数合法性检查 val dbName = JSONUtils.getValue(json, "dbName", "memory") if (StringUtils.isBlank(dbName)) { return ResultMsg.buildError("获取表元数据信息失败,库名不能为空", ErrorCode.PARAM_ILLEGAL) } val tableList = new util.LinkedList[bean.TableMeta] if ("memory".equals(dbName)) { // 内存临时表元数据信息 this.baseSpark.catalog.listTables().collect().foreach(table => { if (StringUtils.isBlank(table.database)) { tableList.add(new bean.TableMeta(table.description, "memory", table.name, table.tableType, table.isTemporary)) } }) } else { // 获取hive表元数据信息 this.baseSpark.catalog.listTables(dbName).collect().foreach(table => { if (StringUtils.isNotBlank(table.database)) { tableList.add(new 
bean.TableMeta(table.description, table.database, table.name, table.tableType, table.isTemporary)) } }) } this.logger.info(s"[listTables] 获取[$dbName]表元数据信息成功:json=$json") ResultMsg.buildSuccess(tableList, s"获取[$dbName]表元数据信息成功") } catch { case e: Exception => { this.logger.error(s"[listTables] 获取表元数据信息失败:json=$json", e) ResultMsg.buildError("获取表元数据信息失败", ErrorCode.ERROR) } } } /** * 获取数据库列表 */ @Rest("/system/listDatabases") def listDatabases(request: Request, response: Response): AnyRef = { try { this.logger.info(s"Ip address ${request.ip()} request /system/listDatabases") // 获取所有的数据库名称 val dbList = new util.LinkedList[String]() this.baseSpark.catalog.listDatabases().collect().foreach(db => dbList.add(db.name)) // 由于spark临时表没有库名,此处约定memory统一作为临时表所在的库 dbList.add("memory") this.logger.info(s"[listDatabases] 获取数据库列表成功") ResultMsg.buildSuccess(dbList, "获取数据库列表成功") } catch { case e: Exception => { this.logger.error(s"[listDatabases] 获取数据库列表失败", e) ResultMsg.buildError("获取数据库列表失败", ErrorCode.ERROR) } } } /** * 获取counter累加器中的值 */ @Rest("/system/counter") def counter(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/counter") val counter = this.baseSpark.acc.getCounter this.logger.info(s"[counter] 获取单值累加器成功:counter=$counter") ResultMsg.buildSuccess(counter, "获取单值累加器成功") } catch { case e: Exception => { this.logger.error(s"[counter] 获取单值累加器失败:json=$json", e) ResultMsg.buildError("获取单值累加器失败", ErrorCode.ERROR) } } } /** * 获取多值累加器中的值 */ @Rest("/system/multiCounter") def multiCounter(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/multiCounter") this.logger.info(s"[multiCounter] 获取多值累加器成功") ResultMsg.buildSuccess(this.baseSpark.acc.getMultiCounter, "获取多值累加器成功") } catch { case e: Exception => { this.logger.error(s"[multiCounter] 获取多值累加器失败:json=$json", e) ResultMsg.buildError("获取多值累加器失败", ErrorCode.ERROR) } } } /** * 获取timer累加器中的值 */ @Rest("/system/multiTimer") def multiTimer(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/multiTimer") val cells = new util.HashSet[Table.Cell[String, String, Long]]() cells.addAll(this.baseSpark.acc.getMultiTimer.cellSet()) val clear = JSONUtils.getValue(json, "clear", false) if (clear) this.baseSpark.acc.multiTimer.reset this.logger.info(s"[multiTimer] 获取timer累加器成功") ResultMsg.buildSuccess(cells, "获取timer累加器成功") } catch { case e: Exception => { this.logger.error(s"[multiTimer] 获取timer累加器失败:json=$json", e) ResultMsg.buildError("获取timer累加器失败", ErrorCode.ERROR) } } } /** * 获取运行时日志 */ @Rest("/system/log") def log(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/log") val logs = new StringBuilder("[") this.baseSpark.acc.getLog.iterator().foreach(log => { logs.append(log + ",") }) // 参数校验与参数获取 val clear = JSONUtils.getValue(json, "clear", false) if (clear) this.baseSpark.acc.logAccumulator.reset if (logs.length > 0 && logs.endsWith(",")) { this.logger.info(s"[log] 日志获取成功:json=$json") ResultMsg.buildSuccess(logs.substring(0, logs.length - 1) + "]", "日志获取成功") } else { this.logger.info(s"[log] 日志记录数为空:json=$json") ResultMsg.buildError("日志记录数为空", ErrorCode.NOT_FOUND) } } catch { case e: Exception => { this.logger.error(s"[log] 日志获取失败:json=$json", e) ResultMsg.buildError("日志获取失败", ErrorCode.ERROR) } } } /** * 获取运行时状态信息,包括GC、jvm、thread、memory、cpu等 */
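// 示例(仅为示意):以上几个累加器类接口(如/system/log、/system/env、/system/multiTimer)均支持可选请求体 {"clear": true},
// 传入后会在读取完成后重置累加器数据,避免下次重复拉取历史数据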
@Rest("/system/env") def env(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/env") val envInfo = new StringBuilder("[") this.baseSpark.acc.getEnv.iterator().foreach(env => { envInfo.append(env + ",") }) // 参数校验与参数获取 val clear = JSONUtils.getValue(json, "clear", false) if (clear) this.baseSpark.acc.logAccumulator.reset if (envInfo.length > 0 && envInfo.endsWith(",")) { this.logger.info(s"[env] 运行时信息获取成功:json=$json") ResultMsg.buildSuccess(envInfo.substring(0, envInfo.length - 1) + "]", "运行时信息获取成功") } else { this.logger.info(s"[env] 运行时信息记录数为空:json=$json") ResultMsg.buildError("运行时信息记录数为空", ErrorCode.NOT_FOUND) } } catch { case e: Exception => { this.logger.error(s"[env] 运行时信息获取失败:json=$json", e) ResultMsg.buildError("运行时信息获取失败", ErrorCode.ERROR) } } } /** * kill 当前 Spark 任务 */ @Rest("/system/kill") def kill(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/kill") // 参数校验与参数获取 val stopGracefully = JSONUtils.getValue(json, "stopGracefully", true) this.baseSpark.after() this.baseSpark.shutdown(stopGracefully) ProcessUtil.executeCmds(s"yarn application -kill ${this.baseSpark.applicationId}", s"kill -9 ${OSUtils.getPid}") this.logger.info(s"[kill] kill任务成功:json=$json") System.exit(0) ResultMsg.buildSuccess("任务停止成功", ErrorCode.SUCCESS.toString) } catch { case e: Exception => { this.logger.error(s"[kill] 执行kill任务失败:json=$json", e) ResultMsg.buildError("执行kill任务失败", ErrorCode.ERROR) } } } /** * 取消job的执行 */ @Rest("/system/cancelJob") def cancelJob(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/cancelJob") // 参数校验与参数获取 val jobId = JSONUtils.getValue(json, "id", -1) if (jobId <= 0) { this.logger.warn(s"[cancelJob] 参数不合法:json=$json") return ResultMsg.buildError(s"参数不合法:json=$json", ErrorCode.ERROR) } this.baseSpark.sc.cancelJob(jobId, s"被管控平台kill:${DateFormatUtils.formatCurrentDateTime()}") this.logger.info(s"[cancelJob] kill job成功:json=$json") ResultMsg.buildSuccess("kill job 成功", ErrorCode.SUCCESS.toString) } catch { case e: Exception => { this.logger.error(s"[cancelJob] kill job失败:json=$json", e) ResultMsg.buildError("kill job失败", ErrorCode.ERROR) } } } /** * 取消stage的执行 */ @Rest("/system/cancelStage") def cancelStage(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/cancelStage") // 参数校验与参数获取 val stageId = JSONUtils.getValue(json, "id", -1) if (stageId <= 0) { this.logger.warn(s"[cancelStage] 参数不合法:json=$json") return ResultMsg.buildError(s"参数不合法:json=$json", ErrorCode.ERROR) } this.baseSpark.sc.cancelStage(stageId, s"被管控平台kill:${DateFormatUtils.formatCurrentDateTime()}") this.logger.info(s"[cancelStage] kill stage[${stageId}] 成功:json=$json") ResultMsg.buildSuccess("kill stage 成功", ErrorCode.SUCCESS.toString) } catch { case e: Exception => { this.logger.error(s"[cancelStage] kill stage失败:json=$json", e) ResultMsg.buildError("kill stage失败", ErrorCode.ERROR) } } } /** * 用于执行sql语句 */ @Rest(value = "/system/sql", method = "post") def sql(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/sql") // 参数校验与参数获取 val sql = JSONUtils.getValue(json, "sql", "") // sql合法性检查 if (StringUtils.isBlank(sql) || !sql.toLowerCase.trim.startsWith("select ")) { 
this.logger.warn(s"[sql] sql不合法,在线调试功能只支持查询操作:json=$json") return ResultMsg.buildError(s"sql不合法,在线调试功能只支持查询操作", ErrorCode.ERROR) } if (this.baseSpark == null || this.baseSpark._spark == null) { this.logger.warn(s"[sql] 系统正在初始化,请稍后再试:json=$json") return "系统正在初始化,请稍后再试" } val sqlResult = this.baseSpark._spark.sql(sql.replace("memory.", "")).limit(1000).showString() this.logger.info(s"成功执行以下查询:${sql}\n执行结果如下:\n" + sqlResult) ResultMsg.buildSuccess(sqlResult, ErrorCode.SUCCESS.toString) } catch { case e: Exception => { this.logger.error(s"[sql] 执行用户sql失败:json=$json", e) ResultMsg.buildError("执行用户sql失败,异常堆栈:" + ExceptionBus.stackTrace(e), ErrorCode.ERROR) } } } /** * 获取当前的spark运行时信息 */ @Rest("/system/sparkInfo") def sparkInfo(request: Request, response: Response): AnyRef = { val json = request.body try { this.logger.info(s"Ip address ${request.ip()} request /system/sparkInfo") if (this.sparkInfoBean == null) { this.sparkInfoBean = new SparkInfo this.sparkInfoBean.setAppName(this.baseSpark.appName) this.sparkInfoBean.setClassName(this.baseSpark.className) this.sparkInfoBean.setFireVersion(FireFrameworkConf.fireVersion) this.sparkInfoBean.setConf(this.baseSpark._spark.conf.getAll) this.sparkInfoBean.setVersion(this.baseSpark.sc.version) this.sparkInfoBean.setMaster(this.baseSpark.sc.master) this.sparkInfoBean.setApplicationId(this.baseSpark.sc.applicationId) this.sparkInfoBean.setApplicationAttemptId(this.baseSpark.sc.applicationAttemptId.getOrElse("")) this.sparkInfoBean.setUi(this.baseSpark.webUI) this.sparkInfoBean.setPid(OSUtils.getPid) this.sparkInfoBean.setLaunchTime(DateFormatUtils.formatUnixDateTime(FireUtils.launchTime * 1000)) this.sparkInfoBean.setExecutorMemory(this.baseSpark.sc.getConf.get("spark.executor.memory", "1")) this.sparkInfoBean.setExecutorInstances(this.baseSpark.sc.getConf.get("spark.executor.instances", "1")) this.sparkInfoBean.setExecutorCores(this.baseSpark.sc.getConf.get("spark.executor.cores", "1")) this.sparkInfoBean.setDriverCores(this.baseSpark.sc.getConf.get("spark.driver.cores", "1")) this.sparkInfoBean.setDriverMemory(this.baseSpark.sc.getConf.get("spark.driver.memory", "1")) this.sparkInfoBean.setDriverMemoryOverhead(this.baseSpark.sc.getConf.get("spark.yarn.driver.memoryOverhead", "0")) this.sparkInfoBean.setDriverHost(this.baseSpark.sc.getConf.get("spark.driver.host", "0")) this.sparkInfoBean.setDriverPort(this.baseSpark.sc.getConf.get("spark.driver.port", "0")) this.sparkInfoBean.setRestPort(this.baseSpark.restfulRegister.restPort.toString) this.sparkInfoBean.setExecutorMemoryOverhead(this.baseSpark.sc.getConf.get("spark.yarn.executor.memoryOverhead", "0")) this.sparkInfoBean.setProperties(PropUtils.cover) this.sparkInfoBean.computeCpuMemory() } this.sparkInfoBean.setUptime(DateFormatUtils.runTime(FireUtils.launchTime)) this.sparkInfoBean.setBatchDuration(this.baseSpark.batchDuration + "") this.sparkInfoBean.setTimestamp(DateFormatUtils.formatCurrentDateTime()) this.logger.info(s"[sparkInfo] 获取spark信息成功:json=$json") ResultMsg.buildSuccess(JSONUtils.toJSONString(this.sparkInfoBean), ErrorCode.SUCCESS.toString) } catch { case e: Exception => { this.logger.error(s"[sparkInfo] 获取spark信息失败:json=$json", e) ResultMsg.buildError("获取spark信息失败", ErrorCode.ERROR) } } } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/sink/FireSink.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.sink import com.zto.fire.spark.util.{SparkSingletonFactory, SparkUtils} import org.apache.spark.internal.Logging import org.apache.spark.sql.DataFrame import org.apache.spark.sql.execution.streaming.Sink /** * Fire框架组件sink父类 * * @author ChengLong 2019年12月23日 10:09:55 * @since 0.4.1 */ private[fire] abstract class FireSink extends Sink with Logging { @volatile protected var latestBatchId = -1L protected lazy val spark = SparkSingletonFactory.getSparkSession /** * 将内部row类型的DataFrame转为Row类型的DataFrame * * @param df * InternalRow类型的DataFrame * @return * Row类型的DataFrame */ protected def toExternalRow(df: DataFrame): DataFrame = { SparkUtils.toExternalRow(df) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/sink/JdbcStreamSink.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.sink import java.util.Objects import com.zto.fire._ import com.zto.fire.jdbc.conf.FireJdbcConf import org.apache.commons.lang3.StringUtils import org.apache.spark.sql.DataFrame /** * jdbc sink组件,支持jdbc操作 * * @param options * jdbc相关参数 * @author ChengLong 2019年12月23日 13:06:30 * @since 0.4.1 */ class JdbcStreamSink(options: Map[String, String]) extends FireSink { override def addBatch(batchId: Long, data: DataFrame): Unit = { logDebug(s"latestBatchId=$latestBatchId") if (batchId <= latestBatchId) { logInfo(s"Skipping already committed batch $batchId") } else { val sql = options.getOrElse("sql", "") Objects.requireNonNull(sql, "sql语句不能为空.") val fields = options.getOrElse("fields", "") val batch = options.getOrElse("batch", FireJdbcConf.batchSize() + "").toInt val keyNum = options.getOrElse("keyNum", "1").toInt this.toExternalRow(data).jdbcBatchUpdate(sql, if (StringUtils.isNotBlank(fields)) fields.split(",") else null, batch, keyNum) latestBatchId = batchId } } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/sql/SparkSqlExtensionsParserBase.scala ================================================ package com.zto.fire.spark.sql import com.zto.fire.common.util.{ExceptionBus, Logging} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan /** * 提供通用的sql解析与校验类 * @param sparkSession 当前SparkSession实例 * @param parser 被包装的原生sql解析器 */ private[fire] class SparkSqlExtensionsParserBase(sparkSession: SparkSession, parser: ParserInterface) extends Logging { /** * Parse a string to a [[LogicalPlan]]. */ def parsePlan(sqlText: String): LogicalPlan = { try { SparkSqlParser.sqlParse(sqlText) parser.parsePlan(sqlText) } catch { case e: Throwable => ExceptionBus.post(e, sqlText) throw e } } /** * Parse a string to an [[Expression]]. */ def parseExpression(sqlText: String): Expression = parser.parseExpression(sqlText) } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/sql/SparkSqlParserBase.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/ package com.zto.fire.spark.sql import com.zto.fire._ import com.zto.fire.common.anno.Internal import com.zto.fire.common.bean.TableIdentifier import com.zto.fire.common.enu.{Datasource, Operation} import com.zto.fire.common.util.SQLLineageManager import com.zto.fire.core.sql.SqlParser import com.zto.fire.predef.JConcurrentHashMap import com.zto.fire.spark.util.{SparkSingletonFactory, SparkUtils} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.{TableIdentifier => SparkTableIdentifier} import org.apache.spark.sql.execution.datasources.CreateTable /** * Spark SQL解析器父类,封装各个spark版本通用的api * * @author ChengLong 2022-09-07 15:31:03 * @since 2.3.2 */ @Internal private[fire] trait SparkSqlParserBase extends SqlParser { protected lazy val spark = SparkSingletonFactory.getSparkSession protected lazy val catalog = this.spark.sessionState.catalog protected lazy val hiveTableMetaDataMap = new JConcurrentHashMap[String, CatalogTable]() /** * 用于判断给定的表是否为hive表 * * @param tableIdentifier 库表 */ @Internal protected def getCatalog(tableIdentifier: TableIdentifier): Datasource = { val isHive = this.isHiveTable(tableIdentifier) if (isHive) Datasource.HIVE else Datasource.VIEW } /** * 将Fire的TableIdentifier转为Spark的TableIdentifier */ @Internal private[fire] def toSparkTableIdentifier(tableIdentifier: TableIdentifier): SparkTableIdentifier = { val db = if (isEmpty(tableIdentifier.database)) None else Some(tableIdentifier.database) SparkTableIdentifier(tableIdentifier.table, db) } /** * 将Spark的TableIdentifier转为Fire的TableIdentifier */ @Internal private[fire] def toFireTableIdentifier(tableIdentifier: SparkTableIdentifier): TableIdentifier = { TableIdentifier(tableIdentifier.unquotedString) } /** * 用于判断表是否存在 */ @Internal private[fire] def tableExists(tableIdentifier: TableIdentifier): Boolean = { tryWithReturn { this.catalog.tableExists(toSparkTableIdentifier(tableIdentifier)) } (this.logger, catchLog = s"判断${tableIdentifier}是否存在发生异常", hook = false) } /** * 用于判断给定的表是否为临时表 */ @Internal override def isTempView(tableIdentifier: TableIdentifier): Boolean = { tryWithReturn { catalog.isTemporaryTable(toSparkTableIdentifier(tableIdentifier)) } (this.logger, catchLog = s"判断${tableIdentifier}是否为临时表或视图失败", hook = false) } /** * 用于判断给定的表是否为hive表 */ @Internal override def isHiveTable(tableIdentifier: TableIdentifier): Boolean = { this.hiveTableMap.mergeGet(tableIdentifier.identifier) { if (this.isTempView(tableIdentifier) || !this.tableExists(tableIdentifier)) return false tryWithReturn { val hiveTable = this.hiveTableMetaDataMap.mergeGet(tableIdentifier.identifier) { catalog.getTableMetadata(toSparkTableIdentifier(tableIdentifier)) } if (hiveTable.provider.isDefined && "hive".equals(hiveTable.provider.get)) true else false } (this.logger, catchLog = s"判断${tableIdentifier}是否为hive表失败", hook = false) } } /** * 将解析到的表信息添加到实时血缘中 */ @Internal protected def addCatalog(identifierSeq: Seq[String], operation: Operation): Unit = { val identifier = this.toTableIdentifier(identifierSeq) this.addCatalog(identifier, operation) } /** * 将解析到的表信息添加到实时血缘中 */ @Internal protected def addCatalog(identifier: TableIdentifier, operation: Operation): Unit = { SQLLineageManager.setCatalog(identifier, this.getCatalog(identifier).toString) SQLLineageManager.setOperation(identifier, operation.toString) if (this.isTempView(identifier)) { SQLLineageManager.setTmpView(identifier, identifier.toString()) } if 
(this.isHiveTable(identifier)) { val metadata = this.hiveTableMetaDataMap.get(identifier.toString) if (metadata != null) { val url = metadata.storage.locationUri if (url.isDefined) SQLLineageManager.setCluster(identifier, url.get.toString) // 添加表属性信息 SQLLineageManager.setOptions(identifier, metadata.properties) // 添加字段信息 val columns = metadata.schema.map(field => (field.name, field.dataType.toString)) if (columns.nonEmpty) SQLLineageManager.setColumns(identifier, columns) // 表注释信息 if (metadata.comment.isDefined) SQLLineageManager.setComment(identifier, metadata.comment.get) } } } /** * 获取库表名 * * @param tableName 解析后的表信息 */ @Internal protected def toTableIdentifier(tableName: Seq[String]): TableIdentifier = { if (tableName.size > 1) TableIdentifier(tableName(1), tableName.head) else if (tableName.size == 1) TableIdentifier(tableName.head) else TableIdentifier("") } /** * SQL语法校验 * @param sql * sql statement * @return * true:校验成功 false:校验失败 */ def sqlLegal(sql: String): Boolean = SparkUtils.sqlLegal(sql) /** * 用于解析SparkSql中的库表信息 */ @Internal override def sqlParser(sql: String): Unit = { if (isEmpty(sql)) return tryWithLog { this.logger.debug(s"开始解析sql语句:$sql") SparkUtils.sqlValidate(sql) val logicalPlan = this.spark.sessionState.sqlParser.parsePlan(sql) SQLLineageManager.addStatement(sql) val sinkTable = this.ddlParser(logicalPlan) this.queryParser(logicalPlan, sinkTable) } (this.logger, catchLog = s"可忽略异常:实时血缘解析SQL报错,SQL:\n$sql", hook = false) } /** * 用于解析查询sql中的库表信息 * * @param sinkTable * 当insert xxx select或create xxx select语句时,sinkTable不为空 */ @Internal protected def queryParser(logicalPlan: LogicalPlan, sinkTable: Option[TableIdentifier]): Unit /** * 用于解析DDL语句中的库表、分区信息 * * @return 返回sink目标表,用于维护表与表之间的关系 */ @Internal protected def ddlParser(logicalPlan: LogicalPlan): Option[TableIdentifier] } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/sql/SqlExtensions.scala ================================================ package com.zto.fire.spark.sql import com.zto.fire.core.sql.SqlExtensionsParser import com.zto.fire.spark.conf.FireSparkConf import org.apache.spark.sql.{SparkSession, SparkSessionExtensions} import org.apache.spark.sql.catalyst.parser.ParserInterface /** * spark sql语法扩展 * @author ChengLong * @date 2022-05-09 14:45:15 * @since 2.2.2 */ private[fire] object SqlExtensions extends SqlExtensionsParser { /** * 启用自定义Sql解析器扩展 */ def sqlExtension(sessionBuilder: SparkSession.Builder): Unit = { if (FireSparkConf.sqlExtensionsEnable) { type ParserBuilder = (SparkSession, ParserInterface) => ParserInterface type ExtensionsBuilder = SparkSessionExtensions => Unit val parserBuilder: ParserBuilder = (sparkSession, parser) => new SparkSqlExtensionsParser(sparkSession, parser) val extBuilder: ExtensionsBuilder = { e => e.injectParser(parserBuilder) } sessionBuilder.withExtensions(extBuilder) } } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/sync/DistributeSyncManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.sync import com.zto.fire.predef._ import com.zto.fire.common.util.OSUtils import com.zto.fire.core.sync.SyncManager import com.zto.fire.spark.util.SparkSingletonFactory import java.util.concurrent.atomic.AtomicInteger /** * Spark分布式数据同步管理器,用于将数据从Driver端同步至每一个executor端 * * @author ChengLong 2021-11-3 14:14:51 * @since 2.2.0 */ object DistributeSyncManager extends SyncManager { private[this] lazy val initExecutors = new AtomicInteger(0) private[this] lazy val spark = SparkSingletonFactory.getSparkSession private[this] lazy val sc = this.spark.sparkContext /** * 获取当前任务的executor数 */ private[fire] def getInitExecutors: Int = { if (this.initExecutors.get() == 0) this.initExecutors.set(this.sc.getConf.get("spark.executor.instances", if (OSUtils.isLinux) "1000" else "10").toInt) this.initExecutors.get() } /** * 根据当前executor数量,创建10倍于executor以上的数据集,并利用foreachPartition将给定的逻辑发送到每一个executor端执行 * * @param fun * 在每一个executor端执行fun逻辑 * @param isAsync * 是否以异步的方式执行 */ def sync(fun: => Unit, isAsync: Boolean = true): Unit = { if (isEmpty(SparkSingletonFactory.getSparkSession)) return val executorNum = this.getInitExecutors val rdd = this.sc.parallelize(1 to executorNum * 10, executorNum * 3) if (isAsync) { rdd.foreachPartitionAsync(_ => { tryWithLog(fun)(this.logger, tryLog = "Synchronizing data to the executor is complete.", catchLog = "Synchronizing data to the executor is failed.") }) } else { rdd.foreachPartition(_ => { tryWithLog(fun)(this.logger, tryLog = "Synchronizing data to the executor is complete.", catchLog = "Synchronizing data to the executor is failed.") }) } } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/sync/SparkLineageAccumulatorManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.sync import com.zto.fire.common.bean.lineage.Lineage import com.zto.fire.common.enu.Datasource import com.zto.fire.common.util.{DatasourceDesc, SQLLineageManager} import com.zto.fire.core.sync.LineageAccumulatorManager import com.zto.fire.predef._ import com.zto.fire.spark.acc.AccumulatorManager /** * 用于将各个executor端数据收集到driver端 * * @author ChengLong 2022-08-24 14:31:08 * @since 2.3.2 */ object SparkLineageAccumulatorManager extends LineageAccumulatorManager { /** * 将血缘信息放到累加器中 */ override def add(lineage: JConcurrentHashMap[Datasource, JHashSet[DatasourceDesc]]): Unit = { AccumulatorManager.addLineage(lineage) } /** * 累加Long类型数据 */ override def add(value: Long): Unit = AccumulatorManager.addCounter(value) /** * 获取收集到的血缘消息 */ override def getValue: Lineage = { new Lineage(AccumulatorManager.getLineage, SQLLineageManager.getSQLLineage) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/sync/SyncSparkEngine.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.sync import com.zto.fire._ import com.zto.fire.common.bean.lineage.Lineage import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.enu.Datasource import com.zto.fire.common.util.{DatasourceDesc, Logging, PropUtils} import com.zto.fire.core.sync.SyncEngineConf import com.zto.fire.spark.acc.AccumulatorManager import com.zto.fire.spark.util.SparkUtils import org.apache.spark.broadcast.Broadcast import org.apache.spark.{SparkConf, SparkContext, SparkEnv} /** * 获取Spark引擎的所有配置信息 * * @author ChengLong * @since 2.0.0 * @create 2021-03-02 10:57 */ private[fire] class SyncSparkEngine extends SyncEngineConf { /** * 获取引擎的所有配置信息 */ override def syncEngineConf: Map[String, String] = { if (SparkUtils.isExecutor) { SparkEnv.get.conf.getAll.toMap } else { Map.empty[String, String] } } /** * 在master端获取系统累加器中的数据 */ override def syncLineage: Lineage = { SparkLineageAccumulatorManager.getValue } /** * 同步引擎各个container的信息到累加器中 */ override def collect: Unit = { if (SparkUtils.isDriver && isCollect.compareAndSet(false, true)) AccumulatorManager.collectLineage } } object SyncSparkEngine extends Logging { // 用于广播spark配置信息 private[fire] var broadcastConf: Broadcast[SparkConf] = _ /** * 将最新的配置信息以广播的方式同步给每一个executor */ private[fire] def syncDynamicConf(sc: SparkContext, conf: SparkConf): Unit = { if (sc != null && conf != null && FireFrameworkConf.dynamicConf) { val broadcastConf = sc.broadcast(conf) this.broadcastConf = broadcastConf DistributeSyncManager.sync({ this.broadcastConf = broadcastConf this.broadcastConf.value.getAll.foreach(kv => { PropUtils.setProperty(kv._1, kv._2) }) this.logger.info("The Executor side configuration has been reloaded.") }) this.logger.info("The Driver side configuration has been reloaded.") } } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/task/SparkInternalTask.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.task import com.zto.fire.common.anno.Scheduled import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.util.{JSONUtils, MQProducer} import com.zto.fire.core.task.FireInternalTask import com.zto.fire.spark.BaseSpark import com.zto.fire.spark.sync.SparkLineageAccumulatorManager /** * 定时任务调度器,用于定时执行Spark框架内部指定的任务 * * @author ChengLong 2019年11月5日 10:11:31 */ private[fire] class SparkInternalTask(baseSpark: BaseSpark) extends FireInternalTask(baseSpark) { /** * 定时采集运行时的jvm、gc、thread、cpu、memory、disk等信息 * 并将采集到的数据存放到EnvironmentAccumulator中 */ @Scheduled(fixedInterval = 60000, scope = "all", initialDelay = 60000L, concurrent = false) override def jvmMonitor: Unit = super.jvmMonitor /** * 实时血缘发送定时任务,定时将血缘信息发送到kafka中 */ @Scheduled(fixedInterval = 60000, initialDelay = 10000, repeatCount = 360) override def lineage: Unit = { sendLineage this.registerLineageHook(sendLineage) def sendLineage: Unit = { if (FireFrameworkConf.lineageEnable && FireFrameworkConf.lineageSendMqEnable) { MQProducer.sendKafka(FireFrameworkConf.lineageMQUrl, FireFrameworkConf.lineageTopic, JSONUtils.toJSONString(SparkLineageAccumulatorManager.getValue)) } } } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/udf/UDFs.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.udf import java.util.Date import com.zto.fire.common.util.{DateFormatUtils, NumberFormatUtils} import org.apache.commons.lang3.StringUtils import org.apache.spark.sql.SparkSession /** * 通用的自定义UDF工具函数集合 * Created by ChengLong on 2017-01-06. 
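 * 使用示例(仅为示意:其中spark为当前SparkSession,表名t_user及字段create_time、name均为演示假设):
 * {{{
 *   UDFs.registerSysUDF(spark)
 *   spark.sql("select addDays(create_time, 1) as next_day, isNull(name) from t_user").show()
 * }}}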
*/ object UDFs extends Serializable { /** * 批量注册系统内置的udf函数 */ def registerSysUDF(spark: SparkSession): Unit = { // ==================== 日期相关 ==================== spark.udf.register("addTimer", Timer.addTimer _) spark.udf.register("addYears", Timer.addYears _) spark.udf.register("addMons", Timer.addMons _) spark.udf.register("addDays", Timer.addDays _) spark.udf.register("addHours", Timer.addHours _) spark.udf.register("addMins", Timer.addMins _) spark.udf.register("addSecs", Timer.addSecs _) spark.udf.register("dateSchemaFormat", Timer.dateSchemaFormat _) spark.udf.register("dateStrSchemaFormat", Timer.dateStrSchemaFormat _) spark.udf.register("isSameDay", Timer.isSameDay _) spark.udf.register("isBig", Timer.isBig _) spark.udf.register("isSmall", Timer.isSmall _) spark.udf.register("isBetween", Timer.isBetween _) spark.udf.register("date", Timer.date _) spark.udf.register("interval", Timer.interval _) spark.udf.register("runTime", Timer.runTime _) spark.udf.register("truncateMinute", Timer.truncateMinute _) spark.udf.register("truncateHour", Timer.truncateHour _) // ==================== 字符串相关 ==================== spark.udf.register("isNull", Str.isNull _) spark.udf.register("isNotNull", Str.isNotNull _) spark.udf.register("len", Str.len _) spark.udf.register("reverse", Str.reverse _) spark.udf.register("contains", Str.contains _) // ==================== 数字相关 ==================== spark.udf.register("floor", Num.floor _) spark.udf.register("long2Int", Num.long2Int _) spark.udf.register("bigDecimal2Long", Num.bigDecimal2Long _) spark.udf.register("ifnull", Num.ifnull _) spark.udf.register("truncate", Num.truncate _) spark.udf.register("truncate_decimal", Num.truncateDecimal _) } /** * 时间相关的udf函数 * 时间戳格式为:yyyy-MM-dd hh:mm:ss */ object Timer { /** * 指定时间字段,对日期进行加减 * * @param field * 'year'、'month'、'day'、'hour'、'minute'、'second' * @param dateTimeStr * 格式:yyyy-MM-dd hh:mm:ss * @param count * 正负数 * @return * 计算后的日期 */ def addTimer(field: String, dateTimeStr: String, count: Int): String = { DateFormatUtils.addTimer(field, dateTimeStr, count) } /** * 对指定的时间字段进行年度加减 */ def addYears(dateTimeStr: String, years: Int): String = { DateFormatUtils.addYears(dateTimeStr, years) } /** * 对指定的时间字段进行月份加减 */ def addMons(dateTimeStr: String, mons: Int): String = { DateFormatUtils.addMons(dateTimeStr, mons) } /** * 对指定的时间字段进行天加减 */ def addDays(dateTimeStr: String, days: Int): String = { DateFormatUtils.addDays(dateTimeStr, days) } /** * 对指定的时间字段进行天加减 */ def addWeeks(dateTimeStr: String, weeks: Int): String = { DateFormatUtils.addWeeks(dateTimeStr, weeks) } /** * 对指定的时间字段进行小时加减 */ def addHours(dateTimeStr: String, hours: Int): String = { DateFormatUtils.addHours(dateTimeStr, hours) } /** * 对指定的时间字段进行分钟加减 */ def addMins(dateTimeStr: String, minutes: Int): String = { DateFormatUtils.addMins(dateTimeStr, minutes) } /** * 对指定的时间字段进行秒钟加减 */ def addSecs(dateTimeStr: String, seconds: Int): String = { DateFormatUtils.addSecs(dateTimeStr, seconds) } /** * 对字段进行格式转换 */ def dateStrSchemaFormat(dateTimeStr: String, srcSchema: String, destSchema: String): String = { if (StringUtils.isBlank(dateTimeStr)) "" else DateFormatUtils.dateSchemaFormat(dateTimeStr, srcSchema, destSchema) } /** * 获取两个时间间隔的毫秒数 * * @param before * 开始时间(小) * @param after * 结束时间(大) * @return */ def interval(before: String, after: String): Long = { DateFormatUtils.interval(before, after) } /** * 计算运行时长 * * @param time * 形如:3日11时21分15秒 */ def runTime(time: Long): String = { DateFormatUtils.runTime(time) } /** * 判断两个字段是否为同一天 */ def isSameDay(day1: String, day2: 
String): Boolean = { DateFormatUtils.isSameDay(day1, day2) } /** * day1是否大于day2 */ def isBig(day1: String, day2: String): Boolean = { DateFormatUtils.isBig(day1, day2) } /** * day1是否小于day2 */ def isSmall(day1: String, day2: String): Boolean = { DateFormatUtils.isSmall(day1, day2) } /** * 指定字段日期是否介于day1与day2之间 */ def isBetween(day: String, day1: String, day2: String) = { DateFormatUtils.isBetween(day, day1, day2) } /** * 截取到年月日 */ def date(dateTime: String): String = { if (StringUtils.isNotBlank(dateTime) && dateTime.length > 10) dateTime.substring(0, 10) else dateTime } /** * 对字段进行格式转换 */ def dateSchemaFormat(dateTime: Date, srcSchema: String, destSchema: String): String = { this.dateStrSchemaFormat(DateFormatUtils.formatDateTime(dateTime), srcSchema, destSchema) } /** * 将yyyy-MM-dd hh:mm:ss类型日期truncate为分钟 */ def truncateMinute(dateTime: String): String = { DateFormatUtils.truncateMinute(dateTime) } /** * 获取整点小时 */ def truncateHour(dateStr: String): String = { DateFormatUtils.truncateHour(dateStr) } } /** * 对字段进行字符串相关操作 */ object Str { /** * 如果字段为空,则返回true,否则返回false */ def isNull(field: String): Boolean = { if (StringUtils.isBlank(field) || field.trim.length() == 0 || "null".equalsIgnoreCase(field.trim) || """\N""".equalsIgnoreCase(field.trim)) { true } else { false } } /** * 如果字段为空,则返回false,否则返回true */ def isNotNull(field: String): Boolean = { !isNull(field) } /** * 计算长度 */ def len(field: String): Int = { if (this.isNull(field)) 0 else field.length } /** * 字符串反转 */ def reverse(str: String): String = { StringUtils.reverse(str) } /** * 是否包含 * * @param field * 字段名称 * @param str * 包含的字符串 * @return */ def contains(field: String, str: String): Boolean = { if (StringUtils.isBlank(field) || StringUtils.isBlank(str)) { false } else { field.contains(str) } } } /** * 数值相关 */ object Num { /** * floor操作 */ def floor(field: Double): Int = { NumberFormatUtils.floor(field) } /** * 将Long转为Integer */ def long2Int(field: java.lang.Long): java.lang.Integer = { NumberFormatUtils.long2Int(field) } /** * 将BigDecimal转为Long类型 */ def bigDecimal2Long(field: java.math.BigDecimal): java.lang.Long = { NumberFormatUtils.bigDecimal2Long(field) } /** * 判断是否为空 */ def ifnull(decimal: java.math.BigDecimal, defaultVal: java.math.BigDecimal): java.math.BigDecimal = { NumberFormatUtils.ifnull(decimal, defaultVal) } /** * 类似于round,但不会四舍五入 * * @param value * 目标值 * @param scale * 精度 * @return */ def truncate(value: Double, scale: Int): Double = { NumberFormatUtils.truncate(value, scale) } /** * 截取精度 * * @param scale * 精度 * @return */ def truncateDecimal(bigDecimal: java.math.BigDecimal, scale: Int): java.math.BigDecimal = { NumberFormatUtils.truncateDecimal(bigDecimal, scale) } } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/util/RocketMQUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.util import com.zto.fire._ import com.zto.fire.common.conf.FireRocketMQConf import com.zto.fire.common.util.{LogUtils, Logging, StringsUtils} import org.apache.commons.lang3.StringUtils import org.apache.rocketmq.spark.{ConsumerStrategy, RocketMQConfig} /** * RocketMQ相关工具类 * * @author ChengLong * @since 1.0.0 * @create 2020-06-29 10:50 */ object RocketMQUtils extends Logging { /** * rocketMQ配置信息 * * @param groupId * 消费组 * @return * rocketMQ相关配置 */ def rocketParams(rocketParam: JMap[String, String] = null, groupId: String = null, rocketNameServer: String = null, tag: String = null, keyNum: Int = 1): JMap[String, String] = { val optionParams = if (rocketParam != null) rocketParam else new JHashMap[String, String]() if (StringUtils.isNotBlank(groupId)) optionParams.put(RocketMQConfig.CONSUMER_GROUP, groupId) // rocket name server 配置 val confNameServer = FireRocketMQConf.rocketNameServer(keyNum) val finalNameServer = if (StringUtils.isNotBlank(confNameServer)) confNameServer else rocketNameServer if (StringUtils.isNotBlank(finalNameServer)) optionParams.put(RocketMQConfig.NAME_SERVER_ADDR, finalNameServer) // tag配置 val confTag = FireRocketMQConf.rocketConsumerTag(keyNum) val finalTag = if (StringUtils.isNotBlank(confTag)) confTag else tag if (StringUtils.isNotBlank(finalTag)) optionParams.put(RocketMQConfig.CONSUMER_TAG, finalTag) else optionParams.put(RocketMQConfig.CONSUMER_TAG, RocketMQConfig.DEFAULT_TAG) // 每个分区拉取的消息数 val maxSpeed = FireRocketMQConf.rocketPullMaxSpeedPerPartition(keyNum) if (StringUtils.isNotBlank(maxSpeed) && StringsUtils.isInt(maxSpeed)) optionParams.put(RocketMQConfig.MAX_PULL_SPEED_PER_PARTITION, maxSpeed) // 以spark.rocket.conf.开头的配置优先级最高 val confMap = FireRocketMQConf.rocketConfMap(keyNum) if (confMap.nonEmpty) optionParams.putAll(confMap) // 日志记录RocketMQ的配置信息 LogUtils.logMap(this.logger, optionParams.toMap, s"RocketMQ configuration. keyNum=$keyNum.") optionParams } /** * 根据消费位点字符串获取ConsumerStrategy实例 * @param offset * latest/earliest */ def valueOfStrategy(offset: String): ConsumerStrategy = { if ("latest".equalsIgnoreCase(offset)) { ConsumerStrategy.lastest } else { ConsumerStrategy.earliest } } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/util/SparkSingletonFactory.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
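A small parameterisation sketch for the helper above; the group id and name-server address are placeholders, and, as the code shows, any values configured through FireRocketMQConf (or spark.rocket.conf.*) for the same keyNum take precedence.

// Sketch: build the consumer option map and choose a consume-from strategy.
val params = RocketMQUtils.rocketParams(
  groupId          = "fire_demo_consumer",   // placeholder consumer group
  rocketNameServer = "127.0.0.1:9876",       // used only when no name server is configured
  tag              = "*",                    // blank falls back to the default tag
  keyNum           = 1)
val strategy = RocketMQUtils.valueOfStrategy("latest")   // any other value resolves to earliest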
*/ package com.zto.fire.spark.util import com.zto.fire.core.util.SingletonFactory import com.zto.fire.hbase.HBaseConnector import com.zto.fire.hbase.conf.FireHBaseConf import com.zto.fire.spark.connector.HBaseBulkConnector import org.apache.commons.lang3.StringUtils import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.apache.spark.streaming.StreamingContext /** * 单例工厂,用于创建单例的对象 * Created by ChengLong on 2018-04-25. */ object SparkSingletonFactory extends SingletonFactory { private[this] var sparkSession: SparkSession = _ private[this] var streamingContext: StreamingContext = _ @transient private[this] var hbaseContext: HBaseBulkConnector = _ /** * 获取SparkSession实例 * * @return * SparkSession实例 */ def getSparkSession: SparkSession = this.synchronized { this.sparkSession } /** * SparkSession赋值 */ private[fire] def setSparkSession(sparkSession: SparkSession): Unit = this.synchronized { require(sparkSession != null, "SparkSession实例不能为空") this.sparkSession = sparkSession } /** * 设置StreamingContext * 允许重复赋值,兼容热重启导致的StreamingContext重新被创建 */ private[fire] def setStreamingContext(ssc: StreamingContext): Unit = this.synchronized { require(ssc != null, "StreamingContext实例不能为空") this.streamingContext = ssc } /** * 获取StreamingContext实例 */ def getStreamingContext: StreamingContext = this.synchronized { assert(this.streamingContext != null, "StreamingContext还没初始化,请稍后再试") this.streamingContext } /** * 获取单例的HBaseContext对象 * * @param sparkContext * SparkContext实例 * @return */ def getHBaseContextInstance(sparkContext: SparkContext, keyNum: Int = 1): HBaseBulkConnector = this.synchronized { if (this.hbaseContext == null && StringUtils.isNotBlank(FireHBaseConf.hbaseCluster())) { this.hbaseContext = new HBaseBulkConnector(sparkContext, HBaseConnector.getConfiguration(keyNum)) } this.hbaseContext } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire/spark/util/SparkUtils.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
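A brief sketch of how the factory above is read from application code; the set* methods are private[fire] and are called by the framework during task initialisation, so user code only consumes the instances.

// Sketch: read-only access (the framework performs the corresponding set* calls).
val spark = SparkSingletonFactory.getSparkSession          // may still be null before initialisation
val ssc   = SparkSingletonFactory.getStreamingContext      // asserts if the streaming context is not yet set
val hbase = SparkSingletonFactory.getHBaseContextInstance(spark.sparkContext)  // null when no HBase cluster is configured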
*/ package com.zto.fire.spark.util import com.zto.fire._ import com.zto.fire.common.anno.FieldName import com.zto.fire.common.conf.{FireFrameworkConf, FireHiveConf} import com.zto.fire.common.util._ import com.zto.fire.jdbc.conf.FireJdbcConf import com.zto.fire.spark.conf.FireSparkConf import org.apache.commons.lang3.StringUtils import org.apache.spark.SparkEnv import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils import org.apache.spark.sql.jdbc.JdbcDialects import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import java.lang.reflect.Field import java.sql.ResultSet import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.util.Try /** * Spark 相关的工具类 * Created by ChengLong on 2016-11-24. */ object SparkUtils extends Logging { private lazy val spark = SparkSingletonFactory.getSparkSession /** * SQL语法校验,如果语法错误,则返回错误堆栈 * @param sql * sql statement */ def sqlValidate(sql: String): Try[Unit] = { val retVal = Try { val logicalPlan = this.spark.sessionState.sqlParser.parsePlan(sql) SimpleAnalyzer.checkAnalysis(logicalPlan) } if (retVal.isFailure) { ExceptionBus.post(retVal.failed.get, sql) } retVal } /** * SQL语法校验 * @param sql * sql statement * @return * true:校验成功 false:校验失败 */ def sqlLegal(sql: String): Boolean = this.sqlValidate(sql).isSuccess /** * 将Row转为自定义bean,以JavaBean中的Field为基准 * bean中的field名称要与DataFrame中的field名称保持一致 */ def sparkRowToBean[T](row: Row, clazz: Class[T]): T = { val obj = clazz.newInstance() if (row != null && clazz != null) { tryWithLog { clazz.getDeclaredFields.foreach(field => { ReflectionUtils.setAccessible(field) val anno = field.getAnnotation(classOf[FieldName]) // 如果没有加注解,或者加了注解但没有打disuse=true if (anno == null || (anno != null && !anno.disuse())) { val fieldName = if (anno != null && StringUtils.isNotBlank(anno.value())) anno.value() else field.getName tryWithLog { if (this.containsColumn(row, fieldName.trim)) { val index = row.fieldIndex(fieldName.trim) val fieldType = field.getType if (fieldType eq classOf[String]) field.set(obj, row.getString(index)) else if (fieldType eq classOf[java.lang.Integer]) field.set(obj, row.getAs[IntegerType](index)) else if (fieldType eq classOf[java.lang.Double]) field.set(obj, row.getAs[DoubleType](index)) else if (fieldType eq classOf[java.lang.Long]) field.set(obj, row.getAs[LongType](index)) else if (fieldType eq classOf[java.math.BigDecimal]) field.set(obj, row.getAs[DecimalType](index)) else if (fieldType eq classOf[java.lang.Float]) field.set(obj, row.getAs[FloatType](index)) else if (fieldType eq classOf[java.lang.Boolean]) field.set(obj, row.getAs[BooleanType](index)) else if (fieldType eq classOf[java.lang.Short]) field.set(obj, row.getAs[ShortType](index)) else if (fieldType eq classOf[java.util.Date]) field.set(obj, row.getAs[DateType](index)) } }(this.logger, catchLog = s"sparkRowToBean转换失败,${fieldName}字段类型不匹配,请检查") } }) }(this.logger, catchLog = s"sparkRowToBean转换失败,请确定JavaBean中的字段是否与Row保持一致") } obj } /** * 将SparkRow迭代映射为对象的迭代 * * @param it * Row迭代器 * @param clazz * 待映射的自定义JavaBean * @tparam T * 泛型 * @return * 映射为对象的集合 */ def sparkRowToBean[T](it: Iterator[Row], clazz: Class[T], toUppercase: Boolean = false): Iterator[T] = { /** * 用于索引给定的字段名称在Row中的index * 同时兼容标注了@FieldName的字段可以被正常索引到 */ def fieldIndex(row: Row, fieldName: String, annoFieldName: String): Int = { try { row.fieldIndex(annoFieldName) } 
catch { case _: Exception => { try { row.fieldIndex(fieldName) } catch { case e: Exception => { this.logger.error(s"将Spark Row转JavaBean失败,未能匹配${fieldName}或${annoFieldName}", e) -1 } } } } } val list = ListBuffer[T]() if (it != null && clazz != null) { tryWithLog { val fields = clazz.getDeclaredFields it.foreach(row => { val obj = clazz.newInstance() fields.foreach(field => { ReflectionUtils.setAccessible(field) val anno = field.getAnnotation(classOf[FieldName]) // 如果没有加注解,或者加了注解但没有打disuse=true if (anno == null || (anno != null && !anno.disuse())) { var fieldName = if (anno != null && StringUtils.isNotBlank(anno.value())) anno.value() else field.getName fieldName = if (toUppercase) fieldName.toUpperCase else fieldName // 兼容标注了@FieldName的字段 if (this.containsColumn(row, fieldName) || this.containsColumn(row, field.getName)) { val index = fieldIndex(row, field.getName, fieldName.trim) if (index >= 0) { val fieldType = field.getType tryWithLog { if (fieldType eq classOf[String]) field.set(obj, row.getString(index)) else if (fieldType eq classOf[java.lang.Integer]) field.set(obj, row.getAs[IntegerType](index)) else if (fieldType eq classOf[java.lang.Long]) field.set(obj, row.getAs[LongType](index)) else if (fieldType eq classOf[java.math.BigDecimal]) field.set(obj, row.getAs[DecimalType](index)) else if (fieldType eq classOf[java.lang.Boolean]) field.set(obj, row.getAs[BooleanType](index)) else if (fieldType eq classOf[java.lang.Double]) field.set(obj, row.getAs[DoubleType](index)) else if (fieldType eq classOf[java.lang.Float]) field.set(obj, row.getAs[FloatType](index)) else if (fieldType eq classOf[java.lang.Short]) field.set(obj, row.getAs[ShortType](index)) else if (fieldType eq classOf[java.util.Date]) field.set(obj, row.getAs[DateType](index)) }(this.logger, catchLog = s"sparkRowToBean转换失败,${fieldName}字段类型不匹配,请检查") } } } }) list += obj }) }(this.logger, catchLog = "sparkRowToBean转换失败,请确定JavaBean中的字段是否与Row保持一致") } list.iterator } /** * 判断指定的Row中是否包含指定的列名 * * @param row * DataFrame中的行 * @param columnName * 列名 * @return * true: 存在 false:不存在 */ def containsColumn(row: Row, columnName: String): Boolean = { Try { try { row.fieldIndex(columnName) } }.isSuccess } /** * 将jdbc查询结果集转为DataFrame * * @param rs * jdbc查询的结果集 * @return */ def resultSet2DataFrame(rs: ResultSet, keyNum: Int = 1): DataFrame = { val rows = this.resultSet2Rows(rs) val structFields = JdbcUtils.getSchema(rs, JdbcDialects.get(FireJdbcConf.jdbcUrl(keyNum)), true) this.spark.createDataFrame(rows, structFields) } /** * 将ResultSet集合转为Row集合 * * @param rs * jdbc查询结果集 * @return * Spark Row */ def resultSet2Rows(rs: ResultSet): List[Row] = { val fieldCount = rs.getMetaData.getColumnCount val rows = ListBuffer[Row]() while (rs.next()) { val row = ArrayBuffer[Any]() (1 to fieldCount).foreach(index => { val value = rs.getObject(index) row += value }) val tmpRow = Row(row: _*) rows += tmpRow } rows.toList } /** * 根据实体bean构建schema信息 * * @return StructField集合 */ def buildSchemaFromBean(beanClazz: Class[_], upper: Boolean = false): List[StructField] = { val fieldMap = ReflectionUtils.getAllFields(beanClazz) val strutFields = new ListBuffer[StructField]() for (map <- fieldMap.entrySet) { val field: Field = map.getValue val fieldType: Class[_] = field.getType val anno: FieldName = field.getAnnotation(classOf[FieldName]) var fieldName: String = map.getKey var nullable: Boolean = true val disuse = if (anno == null) { false } else { if (StringUtils.isNotBlank(anno.value)) { fieldName = anno.value } nullable = anno.nullable() anno.disuse() } if 
(!disuse) { if (upper) fieldName = fieldName.toUpperCase tryWithLog { if (fieldType eq classOf[String]) strutFields += DataTypes.createStructField(fieldName, DataTypes.StringType, nullable) else if (fieldType eq classOf[java.lang.Integer]) strutFields += DataTypes.createStructField(fieldName, DataTypes.IntegerType, nullable) else if (fieldType eq classOf[java.lang.Double]) strutFields += DataTypes.createStructField(fieldName, DataTypes.DoubleType, nullable) else if (fieldType eq classOf[java.lang.Long]) strutFields += DataTypes.createStructField(fieldName, DataTypes.LongType, nullable) else if (fieldType eq classOf[java.math.BigDecimal]) strutFields += DataTypes.createStructField(fieldName, DataTypes.DoubleType, nullable) else if (fieldType eq classOf[java.lang.Float]) strutFields += DataTypes.createStructField(fieldName, DataTypes.FloatType, nullable) else if (fieldType eq classOf[java.lang.Boolean]) strutFields += DataTypes.createStructField(fieldName, DataTypes.BooleanType, nullable) else if (fieldType eq classOf[java.lang.Short]) strutFields += DataTypes.createStructField(fieldName, DataTypes.ShortType, nullable) else if (fieldType eq classOf[java.util.Date]) strutFields += DataTypes.createStructField(fieldName, DataTypes.DateType, nullable) }(this.logger, catchLog = s"buildSchemaFromBean失败,异常字段名:${fieldName}") } } strutFields.toList } /** * 获取kafka中json数据的before和after信息 * * @param beanClazz * json数据对应的java bean类型 * @param isMySQL * 是否为mysql解析的消息 * @param fieldNameUpper * 字段名称是否为大写 * @param parseAll * 是否解析所有字段信息 * @return */ def buildSchema2Kafka(beanClazz: Class[_], parseAll: Boolean = false, isMySQL: Boolean = true, fieldNameUpper: Boolean = false): StructType = { if (parseAll) { val structTypes = new StructType() .add("table", StringType) .add("op_type", StringType) .add("op_ts", StringType) .add("current_ts", StringType) .add("gtid", StringType) .add("logFile", StringType) .add("offset", StringType) .add("schema", StringType) .add("when", StringType) .add("after", StructType(SparkUtils.buildSchemaFromBean(beanClazz, fieldNameUpper))) .add("before", StructType(SparkUtils.buildSchemaFromBean(beanClazz, fieldNameUpper))) if (isMySQL) structTypes.add("pos", LongType) else structTypes.add("pos", StringType) } else { new StructType().add("table", StringType) .add("after", StructType(SparkUtils.buildSchemaFromBean(beanClazz, fieldNameUpper))) } } /** * 获取表的全名 * * @param dbName * 表所在的库名 * @param tableName * 表名 * @return * 库名.表名 */ def getFullTableName(dbName: String = FireHiveConf.defaultDB, tableName: String): String = { val dbNameStr = if (StringUtils.isBlank(dbName)) FireHiveConf.defaultDB else dbName s"$dbNameStr.$tableName" } /** * 分割topic列表,返回set集合 * * @param topics * 多个topic以指定分隔符分割 * @return */ def topicSplit(topics: String, splitStr: String = ","): Set[String] = { requireNonEmpty(topics)("topic不能为空,请在配置文件中[ spark.kafka.topics ]配置") topics.split(splitStr).filter(topic => StringUtils.isNotBlank(topic)).map(topic => topic.trim).toSet } /** * 获取webui地址 */ def getWebUI(spark: SparkSession): String = { val optConf = spark.conf.getOption("spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES") if (optConf.isDefined) { optConf.get } else { "" } } /** * 获取applicationId */ def getApplicationId: String = spark.sparkContext.applicationId /** * 获取spark版本号 */ def getVersion: String = org.apache.spark.SPARK_VERSION /** * 使用配置文件中的spark.streaming.batch.duration覆盖传参的batchDuration * * @param batchDuration * 代码中指定的批次时间 * @param hotRestart 是否热重启,热重启优先级最高 * @return * 
被配置文件覆盖后的批次时间 */ def overrideBatchDuration(batchDuration: Long, hotRestart: Boolean): Long = { if (hotRestart) return batchDuration val confBathDuration = FireSparkConf.confBathDuration if (confBathDuration == -1) { batchDuration } else { Math.abs(confBathDuration) } } /** * 获取spark任务的webUI地址信息 * * @return */ def getUI(webUI: String): String = { val line = new StringBuilder() webUI.split(",").foreach(url => { line.append(StringsUtils.hrefTag(url) + StringsUtils.brTag("")) }) line.toString() } /** * 用于判断当前是否为executor * * @return true: executor false: driver */ def isExecutor: Boolean = { val executorId = this.getExecutorId if (StringUtils.isNotBlank(executorId) && !"driver".equalsIgnoreCase(executorId)) true else false } /** * 获取当前executor id * * @return * executor id或driver */ def getExecutorId: String = { if (SparkEnv.get != null) SparkEnv.get.executorId else "" } /** * 获取入口类名 */ def getMainClass: String = { if (SparkEnv.get != null) SparkEnv.get.conf.get(FireFrameworkConf.DRIVER_CLASS_NAME, "") else "" } /** * 用于判断当前是否为driver * * @return true: driver false: executor */ def isDriver: Boolean = { val label = this.getExecutorId if (StringUtils.isBlank(label) || "driver".equalsIgnoreCase(label)) true else false } /** * 是否是集群模式 * * @return * true: 集群模式 false:本地模式 */ def isCluster: Boolean = { OSUtils.isLinux } /** * 是否是本地模式 * * @return * true: 本地模式 false:集群模式 */ def isLocal: Boolean = { !isCluster } /** * 判断是否为yarn-client模式 * * @return * true: yarn-client模式 */ def isYarnClientMode: Boolean = { "client".equalsIgnoreCase(this.deployMode) } /** * 判断是否为yarn-cluster模式 * * @return * true: yarn-cluster模式 */ def isYarnClusterMode: Boolean = { "cluster".equalsIgnoreCase(this.deployMode) } /** * 获取spark任务运行模式 */ def deployMode: String = { if (this.isLocal) return "local" SparkSingletonFactory.getSparkSession.conf.get("spark.submit.deployMode") } /** * 优先从配置文件中获取配置信息,若获取不到,则从SparkEnv中获取 * * @param key * 配置的key * @param default * 配置为空则返回default * @return * 配置的value */ def getConf(key: String, default: String = ""): String = { var value = PropUtils.getString(key, default) if (StringUtils.isBlank(value) && SparkEnv.get != null) { value = SparkEnv.get.conf.get(key, default) } value } /** * 将指定的schema转为小写 * * @param schema * 转为小写的列 * @return * 转为小写的field数组 */ def schemaToLowerCase(schema: StructType): ArrayBuffer[String] = { val cols = ArrayBuffer[String]() schema.foreach(field => { val fieldName = field.name cols += (s"$fieldName as ${fieldName.toLowerCase}") }) cols } /** * 将内部row类型的DataFrame转为Row类型的DataFrame * * @param df * InternalRow类型的DataFrame * @return * Row类型的DataFrame */ def toExternalRow(df: DataFrame): DataFrame = { val schema = df.schema val mapedRowRDD = df.queryExecution.toRdd.mapPartitions { rows => val converter = CatalystTypeConverters.createToScalaConverter(schema) rows.map(converter(_).asInstanceOf[Row]) } SparkSingletonFactory.getSparkSession.createDataFrame(mapedRowRDD, schema) } /** * 从配置文件中读取并执行hive set的sql */ def executeHiveConfSQL(spark: SparkSession): Unit = { if (spark != null) { val confMap = FireHiveConf.hiveConfMap confMap.foreach(kv => spark.sql(s"set ${kv._1}=${kv._2}")) LogUtils.logMap(this.logger, confMap, "Execute hive sql conf.") } } /** * 分配次执行指定的业务逻辑 * * @param rdd * rdd.foreachPartition * @param batch * 多大批次执行一次sinkFun中定义的操作 * @param mapFun * 将Row类型映射为E类型的逻辑,并将处理后的数据放到listBuffer中 * @param sinkFun * 具体处理逻辑,将数据sink到目标源 */ def rddForeachPartitionBatch[T, E](rdd: RDD[T], mapFun: T => E, sinkFun: ListBuffer[E] => Unit, batch: Int = 1000): Unit = { rdd.foreachPartition(it => 
{ var count: Int = 0 val list = ListBuffer[E]() it.foreach(row => { count += 1 val result = mapFun(row) if (result != null) list += result // 分批次执行 if (count == Math.abs(batch)) { sinkFun(list) count = 0 list.clear() } }) // 将剩余的数据一次执行掉 if (list.nonEmpty) { sinkFun(list) list.clear() } }) } /** * 分配次执行指定的业务逻辑 * * @param df * df.foreachPartition * @param batch * 多大批次执行一次sinkFun中定义的操作 * @param mapFun * 将Row类型映射为E类型的逻辑,并将处理后的数据放到listBuffer中 * @param sinkFun * 具体处理逻辑,将数据sink到目标源 */ def datasetForeachPartitionBatch[T, E](df: Dataset[T], mapFun: T => E, sinkFun: ListBuffer[E] => Unit, batch: Int = 1000): Unit = { df.foreachPartition((it: Iterator[T]) => { var count: Int = 0 val list = ListBuffer[E]() it.foreach(row => { count += 1 val result = mapFun(row) if (result != null) list += result // 分批次执行 if (count == Math.abs(batch)) { sinkFun(list) count = 0 list.clear() } }) // 将剩余的数据一次执行掉 if (list.nonEmpty) { sinkFun(list) list.clear() } }) } /** * 配置化spark DataSource api中的options选项,可通过配置文件方式读取并覆盖代码中指定相同的配置项 * * @param options * 可为空,如果为空,则必须在配置文件中指定 * @param keyNum * 用于区分多个数据源 */ def optionsEnhance(options: Map[String, String] = Map.empty, keyNum: Int = 1): Map[String, String] = { val map = collection.mutable.Map[String, String]() map ++= options map ++= PropUtils.sliceKeysByNum(FireSparkConf.SPARK_DATASOURCE_OPTIONS_PREFIX, keyNum) if (map.isEmpty) { throw new IllegalArgumentException(s"spark datasource options不能为空,请通过配置文件指定,以${FireSparkConf.SPARK_DATASOURCE_OPTIONS_PREFIX}为前缀,以${keyNum}为后缀.") } this.logger.info(s"--> Spark DataSource options信息(keyNum=$keyNum)<--") map.foreach(option => this.logger.info(s"${option._1} = ${option._2}")) map.toMap } } ================================================ FILE: fire-engines/fire-spark/src/main/scala/com/zto/fire.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
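Two short usage sketches for the helpers above; OrderBean, df and the sink are placeholders.

// Sketch 1: map DataFrame rows onto a JavaBean whose field names (or @FieldName values)
// match the DataFrame columns. OrderBean is a hypothetical bean, df an existing DataFrame.
import org.apache.spark.sql.Row
val beans: Iterator[OrderBean] =
  SparkUtils.sparkRowToBean(df.collect().iterator, classOf[OrderBean])

// Sketch 2: per-partition batched sink; sinkFun fires once per full batch and once for the remainder.
SparkUtils.rddForeachPartitionBatch[Row, String](
  rdd     = df.rdd,
  mapFun  = row => row.mkString(","),
  sinkFun = buffer => println(s"writing ${buffer.size} records"),   // placeholder sink
  batch   = 500)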
*/ package com.zto import com.zto.fire.core.ext.BaseFireExt import com.zto.fire.spark.ext.core._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, SQLContext, SparkSession} import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.{SparkConf, SparkContext} import scala.reflect.ClassTag /** * 预定义fire框架中的扩展工具 * * @author ChengLong * @since 1.0.0 * @create 2020-12-22 13:41 */ package object fire extends BaseFireExt { /** * SparkSession扩展 * * @param spark * sparkSession对象 */ implicit class SparkSessionExtBridge(spark: SparkSession) extends SparkSessionExt(spark) { } /** * SparkContext扩展 * * @param sc * SparkContext对象 */ implicit class SparkContextExtBridge(sc: SparkContext) extends SparkContextExt(sc) { } /** * RDD相关的扩展 * * @param rdd * rdd */ implicit class RDDExtBridge[T: ClassTag](rdd: RDD[T]) extends RDDExt[T](rdd) { } /** * SparkConf扩展 * * @param sparkConf * sparkConf对象 */ implicit class SparkConfExtBridge(sparkConf: SparkConf) extends SparkConfExt(sparkConf) { } /** * SQLContext与HiveContext扩展 * * @param sqlContext * sqlContext对象 */ implicit class SQLContextExtBridge(sqlContext: SQLContext) extends SQLContextExt(sqlContext) { } /** * DataFrame扩展 * * @param dataFrame * dataFrame实例 */ implicit class DataFrameExtBridge(dataFrame: DataFrame) extends DataFrameExt(dataFrame) { } /** * Dataset扩展 * * @param dataset * dataset对象 */ implicit class DatasetExtBridge[T: ClassTag](dataset: Dataset[T]) extends DatasetExt[T](dataset) { } /** * StreamingContext扩展 * * @param ssc * StreamingContext对象 */ implicit class StreamingContextExtBridge(ssc: StreamingContext) extends StreamingContextExt(ssc) { } /** * DStream扩展 * * @param stream * stream对象 */ implicit class DStreamExtBridge[T: ClassTag](stream: DStream[T]) extends DStreamExt[T](stream) { } } ================================================ FILE: fire-engines/fire-spark/src/main/scala-spark-2.3/com.zto.fire.spark.sql/SparkSqlExtensionsParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
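The bridges above make a single import sufficient for the enriched APIs to appear on the wrapped Spark types; the sketch below only demonstrates the implicit resolution, since the concrete extension methods are defined in com.zto.fire.spark.ext.core and are outside this excerpt.

// Sketch: once the package object is imported, a plain DataFrame is implicitly viewed
// as a DataFrameExt via DataFrameExtBridge (df is an assumed, existing DataFrame).
import com.zto.fire._
import com.zto.fire.spark.ext.core.DataFrameExt

val enriched: DataFrameExt = df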
*/ package com.zto.fire.spark.sql import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.types.{DataType, StructType} /** * Spark Sql解析扩展,用于拦截执行的sql以及解析sql中的血缘 * * @author ChengLong 2021-6-23 10:25:17 * @since 2.0.0 */ private[fire] class SparkSqlExtensionsParser(sparkSession: SparkSession, parser: ParserInterface) extends SparkSqlExtensionsParserBase(sparkSession, parser) with ParserInterface { override def parseTableIdentifier(sqlText: String): TableIdentifier = { parser.parseTableIdentifier(sqlText) } override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = { parser.parseFunctionIdentifier(sqlText) } override def parseTableSchema(sqlText: String): StructType = { parser.parseTableSchema(sqlText) } override def parseDataType(sqlText: String): DataType = { parser.parseDataType(sqlText) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala-spark-2.3/com.zto.fire.spark.sql/SparkSqlParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.sql import com.zto.fire.common.bean.TableIdentifier import com.zto.fire.common.enu.Operation import com.zto.fire.common.util.SQLLineageManager import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.CreateTable /** * Spark SQL解析器,用于解析Spark SQL语句中的库、表、分区、操作类型等信息 * * @author ChengLong 2021-6-18 16:31:04 * @since 2.0.0 */ private[fire] object SparkSqlParser extends SparkSqlParserBase { /** * 用于解析查询sql中的库表信息 * * @param sinkTable * 当insert xxx select或create xxx select语句时,sinkTable不为空 */ override def queryParser(logicalPlan: LogicalPlan, sinkTable: Option[TableIdentifier]): Unit = { logicalPlan.children.foreach(child => { this.queryParser(child, sinkTable) var sourceTable: Option[TableIdentifier] = None child match { case unresolvedRelation: UnresolvedRelation => val tableIdentifier = toFireTableIdentifier(unresolvedRelation.tableIdentifier) this.addCatalog(tableIdentifier, Operation.SELECT) sourceTable = Some(tableIdentifier) // 如果是insert xxx select或create xxx select语句,则维护表与表之间的关系 if (sinkTable.isDefined) SQLLineageManager.addRelation(tableIdentifier, sinkTable.get) case _ => this.logger.debug(s"Parse query SQL异常,无法匹配该Statement. 
") } }) } /** * 用于解析DDL语句中的库表、分区信息 * * @return 返回sink目标表,用于维护表与表之间的关系 */ override def ddlParser(logicalPlan: LogicalPlan): Option[TableIdentifier] = { var sinkTable: Option[TableIdentifier] = None logicalPlan match { // insert into语句解析 case insertInto: InsertIntoTable => { val identifier = this.toFireTableIdentifier(insertInto.table.asInstanceOf[UnresolvedRelation].tableIdentifier) this.addCatalog(identifier, Operation.INSERT_INTO) // 维护分区信息 val partitions = insertInto.partition.map(part => (part._1, if (part._2.isDefined) part._2.get else "")) SQLLineageManager.setPartitions(identifier, partitions.toSeq) sinkTable = Some(identifier) } // drop table语句解析 case dropTable: DropTableCommand => this.addCatalog(this.toFireTableIdentifier(dropTable.tableName), Operation.DROP_TABLE) // rename table语句解析 case renameTableEvent: AlterTableRenameCommand => val tableIdentifier = toFireTableIdentifier(renameTableEvent.oldName) val newTableIdentifier = toFireTableIdentifier(renameTableEvent.newName) this.addCatalog(tableIdentifier, Operation.RENAME_TABLE_OLD) this.addCatalog(newTableIdentifier, Operation.RENAME_TABLE_NEW) SQLLineageManager.addRelation(tableIdentifier, newTableIdentifier) // create table语句解析 case createTable: CreateTable => { val identifier = this.toFireTableIdentifier(createTable.tableDesc.identifier) this.addCatalog(identifier, Operation.CREATE_TABLE) sinkTable = Some(identifier) // 采集建表属性信息 SQLLineageManager.setOptions(identifier, createTable.tableDesc.properties) // 采集分区字段信息 val partitions = createTable.tableDesc.partitionSchema.map(st => (st.dataType.toString, st.name)) SQLLineageManager.setPartitions(identifier, partitions) } // rename partition语句解析 case renamePartition: AlterTableRenamePartitionCommand => { val tableIdentifier = this.toFireTableIdentifier(renamePartition.tableName) this.addCatalog(tableIdentifier, Operation.RENAME_PARTITION_OLD) this.addCatalog(tableIdentifier, Operation.RENAME_PARTITION_NEW) SQLLineageManager.setPartitions(tableIdentifier, renamePartition.oldPartition.toSeq) SQLLineageManager.setPartitions(tableIdentifier, renamePartition.newPartition.toSeq) } // drop partition语句解析 case dropPartition: AlterTableDropPartitionCommand => { val tableIdentifier = this.toFireTableIdentifier(dropPartition.tableName) this.addCatalog(tableIdentifier, Operation.DROP_PARTITION) SQLLineageManager.setPartitions(tableIdentifier, dropPartition.specs.head.toSeq) } // add partition语句解析 case addPartition: AlterTableAddPartitionCommand => { val tableIdentifier = this.toFireTableIdentifier(addPartition.tableName) this.addCatalog(tableIdentifier, Operation.ADD_PARTITION) SQLLineageManager.setPartitions(tableIdentifier, addPartition.partitionSpecsAndLocs.head._1.toSeq) } // truncate table语句解析 case truncateTable: TruncateTableCommand => { val tableIdentifier = this.toFireTableIdentifier(truncateTable.tableName) this.addCatalog(tableIdentifier, Operation.TRUNCATE) } case cacheTable: CacheTableCommand => { val tableIdentifier = this.toFireTableIdentifier(cacheTable.tableIdent) this.addCatalog(tableIdentifier, Operation.CACHE) } case uncacheTable: UncacheTableCommand => { val tableIdentifier = this.toFireTableIdentifier(uncacheTable.tableIdent) this.addCatalog(tableIdentifier, Operation.UNCACHE) } case _ => this.logger.debug(s"Parse ddl SQL异常,无法匹配该Statement.") } sinkTable } } ================================================ FILE: fire-engines/fire-spark/src/main/scala-spark-2.4/com.zto.fire.spark.sql/SparkSqlExtensionsParser.scala ================================================ /* * 
Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.sql import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.types.{DataType, StructType} /** * Spark Sql解析扩展,用于拦截执行的sql以及解析sql中的血缘 * * @author ChengLong 2021-6-23 10:25:17 * @since 2.0.0 */ private[fire] class SparkSqlExtensionsParser(sparkSession: SparkSession, parser: ParserInterface) extends SparkSqlExtensionsParserBase(sparkSession, parser) with ParserInterface { override def parseTableIdentifier(sqlText: String): TableIdentifier = { parser.parseTableIdentifier(sqlText) } override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = { parser.parseFunctionIdentifier(sqlText) } override def parseTableSchema(sqlText: String): StructType = { parser.parseTableSchema(sqlText) } override def parseDataType(sqlText: String): DataType = { parser.parseDataType(sqlText) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala-spark-2.4/com.zto.fire.spark.sql/SparkSqlParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
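A version-neutral sketch of what the two hooks above record for a typical insert-select, assuming the extension parser is installed; the table names are placeholders and the driving entry points live in the *Base classes, which are not part of this excerpt.

// Sketch: lineage captured for one statement.
spark.sql("INSERT INTO dw.t_sink SELECT id, name FROM dw.t_source")
// ddlParser   -> Operation.INSERT_INTO on dw.t_sink (plus its partition spec, if any)
// queryParser -> Operation.SELECT on dw.t_source
//             -> SQLLineageManager.addRelation(dw.t_source -> dw.t_sink)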
*/ package com.zto.fire.spark.sql import com.zto.fire.common.bean.TableIdentifier import com.zto.fire.common.enu.Operation import com.zto.fire.common.util.SQLLineageManager import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.CreateTable /** * Spark SQL解析器,用于解析Spark SQL语句中的库、表、分区、操作类型等信息 * * @author ChengLong 2021-6-18 16:31:04 * @since 2.0.0 */ private[fire] object SparkSqlParser extends SparkSqlParserBase { /** * 用于解析查询sql中的库表信息 * * @param sinkTable * 当insert xxx select或create xxx select语句时,sinkTable不为空 */ override def queryParser(logicalPlan: LogicalPlan, sinkTable: Option[TableIdentifier]): Unit = { logicalPlan.children.foreach(child => { this.queryParser(child, sinkTable) var sourceTable: Option[TableIdentifier] = None child match { case unresolvedRelation: UnresolvedRelation => val tableIdentifier = toFireTableIdentifier(unresolvedRelation.tableIdentifier) this.addCatalog(tableIdentifier, Operation.SELECT) sourceTable = Some(tableIdentifier) // 如果是insert xxx select或create xxx select语句,则维护表与表之间的关系 if (sinkTable.isDefined) SQLLineageManager.addRelation(tableIdentifier, sinkTable.get) case _ => this.logger.debug(s"Parse query SQL异常,无法匹配该Statement. ") } }) } /** * 用于解析DDL语句中的库表、分区信息 * * @return 返回sink目标表,用于维护表与表之间的关系 */ override def ddlParser(logicalPlan: LogicalPlan): Option[TableIdentifier] = { var sinkTable: Option[TableIdentifier] = None logicalPlan match { // insert into语句解析 case insertInto: InsertIntoTable => { val identifier = this.toFireTableIdentifier(insertInto.table.asInstanceOf[UnresolvedRelation].tableIdentifier) this.addCatalog(identifier, Operation.INSERT_INTO) // 维护分区信息 val partitions = insertInto.partition.map(part => (part._1, if (part._2.isDefined) part._2.get else "")) SQLLineageManager.setPartitions(identifier, partitions.toSeq) sinkTable = Some(identifier) } // drop table语句解析 case dropTable: DropTableCommand => this.addCatalog(this.toFireTableIdentifier(dropTable.tableName), Operation.DROP_TABLE) // rename table语句解析 case renameTableEvent: AlterTableRenameCommand => val tableIdentifier = toFireTableIdentifier(renameTableEvent.oldName) val newTableIdentifier = toFireTableIdentifier(renameTableEvent.newName) this.addCatalog(tableIdentifier, Operation.RENAME_TABLE_OLD) this.addCatalog(newTableIdentifier, Operation.RENAME_TABLE_NEW) SQLLineageManager.addRelation(tableIdentifier, newTableIdentifier) // create table语句解析 case createTable: CreateTable => { val identifier = this.toFireTableIdentifier(createTable.tableDesc.identifier) this.addCatalog(identifier, Operation.CREATE_TABLE) sinkTable = Some(identifier) // 采集建表属性信息 SQLLineageManager.setOptions(identifier, createTable.tableDesc.properties) // 采集分区字段信息 val partitions = createTable.tableDesc.partitionSchema.map(st => (st.dataType.toString, st.name)) SQLLineageManager.setPartitions(identifier, partitions) } // rename partition语句解析 case renamePartition: AlterTableRenamePartitionCommand => { val tableIdentifier = this.toFireTableIdentifier(renamePartition.tableName) this.addCatalog(tableIdentifier, Operation.RENAME_PARTITION_OLD) this.addCatalog(tableIdentifier, Operation.RENAME_PARTITION_NEW) SQLLineageManager.setPartitions(tableIdentifier, renamePartition.oldPartition.toSeq) SQLLineageManager.setPartitions(tableIdentifier, renamePartition.newPartition.toSeq) } // drop partition语句解析 case dropPartition: AlterTableDropPartitionCommand => { val tableIdentifier = 
this.toFireTableIdentifier(dropPartition.tableName) this.addCatalog(tableIdentifier, Operation.DROP_PARTITION) SQLLineageManager.setPartitions(tableIdentifier, dropPartition.specs.head.toSeq) } // add partition语句解析 case addPartition: AlterTableAddPartitionCommand => { val tableIdentifier = this.toFireTableIdentifier(addPartition.tableName) this.addCatalog(tableIdentifier, Operation.ADD_PARTITION) SQLLineageManager.setPartitions(tableIdentifier, addPartition.partitionSpecsAndLocs.head._1.toSeq) } // truncate table语句解析 case truncateTable: TruncateTableCommand => { val tableIdentifier = this.toFireTableIdentifier(truncateTable.tableName) this.addCatalog(tableIdentifier, Operation.TRUNCATE) } case cacheTable: CacheTableCommand => { val tableIdentifier = this.toFireTableIdentifier(cacheTable.tableIdent) this.addCatalog(tableIdentifier, Operation.CACHE) } case uncacheTable: UncacheTableCommand => { val tableIdentifier = this.toFireTableIdentifier(uncacheTable.tableIdent) this.addCatalog(tableIdentifier, Operation.UNCACHE) } case _ => this.logger.debug(s"Parse ddl SQL异常,无法匹配该Statement.") } sinkTable } } ================================================ FILE: fire-engines/fire-spark/src/main/scala-spark-3.0/com/zto/fire/spark/sql/SparkSqlExtensionsParser.scala ================================================ package com.zto.fire.spark.sql import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.types.{DataType, StructType} /** * Spark Sql解析扩展,用于拦截执行的sql以及解析sql中的血缘 * * @author ChengLong 2021-6-23 10:25:17 * @since 2.0.0 */ private[fire] class SparkSqlExtensionsParser(sparkSession: SparkSession, parser: ParserInterface) extends SparkSqlExtensionsParserBase(sparkSession, parser) with ParserInterface { /** * Parse a string to a [[TableIdentifier]]. */ override def parseTableIdentifier(sqlText: String): TableIdentifier = parser.parseTableIdentifier(sqlText) /** * Parse a string to a [[FunctionIdentifier]]. */ override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = parser.parseFunctionIdentifier(sqlText) /** * Parse a string to a [[StructType]]. The passed SQL string should be a comma separated * list of field definitions which will preserve the correct Hive metadata. */ override def parseTableSchema(sqlText: String): StructType = parser.parseTableSchema(sqlText) /** * Parse a string to a [[DataType]]. */ override def parseDataType(sqlText: String): DataType = parser.parseDataType(sqlText) /** * Parse a string to a multi-part identifier. */ override def parseMultipartIdentifier(sqlText: String): Seq[String] = parser.parseMultipartIdentifier(sqlText) /** * Parse a string to a raw [[DataType]] without CHAR/VARCHAR replacement. */ override def parseRawDataType(sqlText: String): DataType = parser.parseRawDataType(sqlText) } ================================================ FILE: fire-engines/fire-spark/src/main/scala-spark-3.0/com/zto/fire/spark/sql/SparkSqlParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
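From Spark 3.0 onward the unresolved relation exposes a multi-part identifier instead of a TableIdentifier; below is a minimal sketch, under that assumption, of reducing such a Seq to database and table parts (the real conversion is toTableIdentifier in SparkSqlParserBase, which is not shown here).

// Sketch only: collapsing a Spark 3.x multi-part identifier into (db, table).
def toDbTable(parts: Seq[String]): (String, String) = parts match {
  case Seq(table)        => ("", table)                 // table
  case Seq(db, table)    => (db, table)                 // db.table
  case Seq(_, db, table) => (db, table)                 // catalog.db.table
  case _                 => ("", parts.mkString("."))
}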
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.sql import com.zto.fire.common.anno.Internal import com.zto.fire.common.bean.TableIdentifier import com.zto.fire.common.enu.Operation import com.zto.fire.common.util.SQLLineageManager import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.datasources.CreateTable /** * Spark SQL解析器,用于解析Spark SQL语句中的库、表、分区、操作类型等信息 * * @author ChengLong 2021-6-18 16:31:04 * @since 2.0.0 */ @Internal private[fire] object SparkSqlParser extends SparkSqlParserBase { /** * 用于解析查询sql中的库表信息 * * @param sinkTable * 当insert xxx select或create xxx select语句时,sinkTable不为空 */ override def queryParser(logicalPlan: LogicalPlan, sinkTable: Option[TableIdentifier]): Unit = { logicalPlan.children.foreach(child => { this.queryParser(child, sinkTable) var sourceTable: Option[TableIdentifier] = None child match { case unresolvedRelation: UnresolvedRelation => this.addCatalog(unresolvedRelation.multipartIdentifier, Operation.SELECT) sourceTable = Some(toTableIdentifier(unresolvedRelation.multipartIdentifier)) // 如果是insert xxx select或create xxx select语句,则维护表与表之间的关系 if (sinkTable.isDefined) SQLLineageManager.addRelation(toTableIdentifier(unresolvedRelation.multipartIdentifier), sinkTable.get) case _ => this.logger.debug(s"Parse query SQL异常,无法匹配该Statement. ") } }) } /** * 用于解析DDL语句中的库表、分区信息 * * @return 返回sink目标表,用于维护表与表之间的关系 */ override def ddlParser(logicalPlan: LogicalPlan): Option[TableIdentifier] = { var sinkTable: Option[TableIdentifier] = None logicalPlan match { // insert into语句解析 case insertInto: InsertIntoStatement => { val identifier = insertInto.table.asInstanceOf[UnresolvedRelation].multipartIdentifier this.addCatalog(identifier, Operation.INSERT_INTO) // 维护分区信息 val fireTableIdentifier = toTableIdentifier(identifier) val partitions = insertInto.partitionSpec.map(part => (part._1, if (part._2.isDefined) part._2.get else "")) SQLLineageManager.setPartitions(fireTableIdentifier, partitions.toSeq) sinkTable = Some(fireTableIdentifier) } // drop table语句解析 case dropTable: DropTableStatement => { this.addCatalog(dropTable.tableName, Operation.DROP_TABLE) } // rename table语句解析 case renameTable: RenameTableStatement => { this.addCatalog(renameTable.oldName, Operation.RENAME_TABLE_OLD) this.addCatalog(renameTable.newName, Operation.RENAME_TABLE_NEW) SQLLineageManager.addRelation(toTableIdentifier(renameTable.oldName), toTableIdentifier(renameTable.newName)) } // create table as select语句解析 case createTableAsSelect: CreateTableAsSelectStatement => { val identifier = this.toTableIdentifier(createTableAsSelect.tableName) this.addCatalog(identifier, Operation.CREATE_TABLE_AS_SELECT) // 采集建表属性信息 SQLLineageManager.setOptions(identifier, createTableAsSelect.properties) sinkTable = Some(identifier) } // create table语句解析 case createTable: CreateTable => { val identifier = this.toFireTableIdentifier(createTable.tableDesc.identifier) this.addCatalog(identifier, Operation.CREATE_TABLE) sinkTable = Some(identifier) // 采集建表属性信息 SQLLineageManager.setOptions(identifier, createTable.tableDesc.properties) // 采集分区字段信息 val partitions = 
createTable.tableDesc.partitionSchema.map(st => (st.dataType.toString, st.name)) SQLLineageManager.setPartitions(identifier, partitions) } // rename partition语句解析 case renamePartition: AlterTableRenamePartitionStatement => { this.addCatalog(renamePartition.tableName, Operation.RENAME_PARTITION_OLD) this.addCatalog(renamePartition.tableName, Operation.RENAME_PARTITION_NEW) SQLLineageManager.setPartitions(this.toTableIdentifier(renamePartition.tableName), renamePartition.from.toSeq) SQLLineageManager.setPartitions(this.toTableIdentifier(renamePartition.tableName), renamePartition.to.toSeq) } // drop partition语句解析 case dropPartition: AlterTableDropPartitionStatement => { this.addCatalog(dropPartition.tableName, Operation.DROP_PARTITION) SQLLineageManager.setPartitions(this.toTableIdentifier(dropPartition.tableName), dropPartition.specs.head.toSeq) } // add partition语句解析 case addPartition: AlterTableAddPartitionStatement => { this.addCatalog(addPartition.tableName, Operation.ADD_PARTITION) SQLLineageManager.setPartitions(this.toTableIdentifier(addPartition.tableName), addPartition.partitionSpecsAndLocs.head._1.toSeq) } // truncate table语句解析 case truncateTable: TruncateTableStatement => { this.addCatalog(truncateTable.tableName, Operation.TRUNCATE) } case cacheTable: CacheTableStatement => { this.addCatalog(cacheTable.tableName, Operation.CACHE) } case uncacheTable: UncacheTableStatement => { this.addCatalog(uncacheTable.tableName, Operation.UNCACHE) } case refreshTable: RefreshTableStatement => { this.addCatalog(refreshTable.tableName, Operation.REFRESH) } case _ => this.logger.debug(s"Parse ddl SQL异常,无法匹配该Statement.") } sinkTable } } ================================================ FILE: fire-engines/fire-spark/src/main/scala-spark-3.1/com/zto/fire/spark/sql/SparkSqlExtensionsParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.sql import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.types.{DataType, StructType} /** * Spark Sql解析扩展,用于拦截执行的sql以及解析sql中的血缘 * * @author ChengLong 2021-6-23 10:25:17 * @since 2.0.0 */ private[fire] class SparkSqlExtensionsParser(sparkSession: SparkSession, parser: ParserInterface) extends SparkSqlExtensionsParserBase(sparkSession, parser) with ParserInterface { /** * Parse a string to a [[TableIdentifier]]. */ override def parseTableIdentifier(sqlText: String): TableIdentifier = parser.parseTableIdentifier(sqlText) /** * Parse a string to a [[FunctionIdentifier]]. 
*/ override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = parser.parseFunctionIdentifier(sqlText) /** * Parse a string to a [[StructType]]. The passed SQL string should be a comma separated * list of field definitions which will preserve the correct Hive metadata. */ override def parseTableSchema(sqlText: String): StructType = parser.parseTableSchema(sqlText) /** * Parse a string to a [[DataType]]. */ override def parseDataType(sqlText: String): DataType = parser.parseDataType(sqlText) /** * Parse a string to a multi-part identifier. */ override def parseMultipartIdentifier(sqlText: String): Seq[String] = parser.parseMultipartIdentifier(sqlText) } ================================================ FILE: fire-engines/fire-spark/src/main/scala-spark-3.1/com/zto/fire/spark/sql/SparkSqlParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.sql import com.zto.fire.common.bean.TableIdentifier import com.zto.fire.common.enu.Operation import com.zto.fire.common.util.SQLLineageManager import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.command.{CacheTableCommand, UncacheTableCommand} import org.apache.spark.sql.execution.datasources.CreateTable /** * Spark SQL解析器,用于解析Spark SQL语句中的库、表、分区、操作类型等信息 * * @author ChengLong 2021-6-18 16:31:04 * @since 2.0.0 */ private[fire] object SparkSqlParser extends SparkSqlParserBase { /** * 用于解析查询sql中的库表信息 * * @param sinkTable * 当insert xxx select或create xxx select语句时,sinkTable不为空 */ override def queryParser(logicalPlan: LogicalPlan, sinkTable: Option[TableIdentifier]): Unit = { logicalPlan.children.foreach(child => { this.queryParser(child, sinkTable) var sourceTable: Option[TableIdentifier] = None child match { case unresolvedRelation: UnresolvedRelation => this.addCatalog(unresolvedRelation.multipartIdentifier, Operation.SELECT) sourceTable = Some(toTableIdentifier(unresolvedRelation.multipartIdentifier)) // 如果是insert xxx select或create xxx select语句,则维护表与表之间的关系 if (sinkTable.isDefined) SQLLineageManager.addRelation(toTableIdentifier(unresolvedRelation.multipartIdentifier), sinkTable.get) case _ => this.logger.debug(s"Parse query SQL异常,无法匹配该Statement. 
") } }) } /** * 用于解析DDL语句中的库表、分区信息 * * @return 返回sink目标表,用于维护表与表之间的关系 */ override def ddlParser(logicalPlan: LogicalPlan): Option[TableIdentifier] = { var sinkTable: Option[TableIdentifier] = None logicalPlan match { // insert into语句解析 case insertInto: InsertIntoStatement => { val identifier = insertInto.table.asInstanceOf[UnresolvedRelation].multipartIdentifier this.addCatalog(identifier, Operation.INSERT_INTO) // 维护分区信息 val fireTableIdentifier = toTableIdentifier(identifier) val partitions = insertInto.partitionSpec.map(part => (part._1, if (part._2.isDefined) part._2.get else "")) SQLLineageManager.setPartitions(fireTableIdentifier, partitions.toSeq) sinkTable = Some(fireTableIdentifier) } // rename table语句解析 case renameTable: RenameTableStatement => { this.addCatalog(renameTable.oldName, Operation.RENAME_TABLE_OLD) this.addCatalog(renameTable.newName, Operation.RENAME_TABLE_NEW) SQLLineageManager.addRelation(toTableIdentifier(renameTable.oldName), toTableIdentifier(renameTable.newName)) } // create table as select语句解析 case createTableAsSelect: CreateTableAsSelectStatement => { val identifier = this.toTableIdentifier(createTableAsSelect.tableName) this.addCatalog(identifier, Operation.CREATE_TABLE_AS_SELECT) // 采集建表属性信息 SQLLineageManager.setOptions(identifier, createTableAsSelect.properties) sinkTable = Some(identifier) } // create table语句解析 case createTable: CreateTable => { val identifier = this.toFireTableIdentifier(createTable.tableDesc.identifier) this.addCatalog(identifier, Operation.CREATE_TABLE) sinkTable = Some(identifier) // 采集建表属性信息 SQLLineageManager.setOptions(identifier, createTable.tableDesc.properties) // 采集分区字段信息 val partitions = createTable.tableDesc.partitionSchema.map(st => (st.dataType.toString, st.name)) SQLLineageManager.setPartitions(identifier, partitions) } // rename partition语句解析 case renamePartition: AlterTableRenamePartitionStatement => { this.addCatalog(renamePartition.tableName, Operation.RENAME_PARTITION_OLD) this.addCatalog(renamePartition.tableName, Operation.RENAME_PARTITION_NEW) SQLLineageManager.setPartitions(this.toTableIdentifier(renamePartition.tableName), renamePartition.from.toSeq) SQLLineageManager.setPartitions(this.toTableIdentifier(renamePartition.tableName), renamePartition.to.toSeq) } case _ => this.logger.debug(s"Parse ddl SQL异常,无法匹配该Statement.") } sinkTable } } ================================================ FILE: fire-engines/fire-spark/src/main/scala-spark-3.2/com/zto/fire/spark/sql/SparkSqlExtensionsParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.sql import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.types.{DataType, StructType} /** * Spark Sql解析扩展,用于拦截执行的sql以及解析sql中的血缘 * * @author ChengLong 2021-6-23 10:25:17 * @since 2.0.0 */ private[fire] class SparkSqlExtensionsParser(sparkSession: SparkSession, parser: ParserInterface) extends SparkSqlExtensionsParserBase(sparkSession, parser) with ParserInterface { override def parseTableIdentifier(sqlText: String): TableIdentifier = { parser.parseTableIdentifier(sqlText) } override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = { parser.parseFunctionIdentifier(sqlText) } override def parseMultipartIdentifier(sqlText: String): Seq[String] = { parser.parseMultipartIdentifier(sqlText) } override def parseTableSchema(sqlText: String): StructType = { parser.parseTableSchema(sqlText) } override def parseDataType(sqlText: String): DataType = { parser.parseDataType(sqlText) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala-spark-3.2/com/zto/fire/spark/sql/SparkSqlParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.spark.sql import com.zto.fire.common.bean.TableIdentifier import com.zto.fire.common.enu.Operation import com.zto.fire.common.util.SQLLineageManager import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.command.AlterTableRenamePartitionCommand import org.apache.spark.sql.execution.datasources.CreateTable /** * Spark SQL解析器,用于解析Spark SQL语句中的库、表、分区、操作类型等信息 * * @author ChengLong 2021-6-18 16:31:04 * @since 2.0.0 */ private[fire] object SparkSqlParser extends SparkSqlParserBase { /** * 用于解析查询sql中的库表信息 * * @param sinkTable * 当insert xxx select或create xxx select语句时,sinkTable不为空 */ override def queryParser(logicalPlan: LogicalPlan, sinkTable: Option[TableIdentifier]): Unit = { logicalPlan.children.foreach(child => { this.queryParser(child, sinkTable) var sourceTable: Option[TableIdentifier] = None child match { case unresolvedRelation: UnresolvedRelation => this.addCatalog(unresolvedRelation.multipartIdentifier, Operation.SELECT) sourceTable = Some(toTableIdentifier(unresolvedRelation.multipartIdentifier)) // 如果是insert xxx select或create xxx select语句,则维护表与表之间的关系 if (sinkTable.isDefined) SQLLineageManager.addRelation(toTableIdentifier(unresolvedRelation.multipartIdentifier), sinkTable.get) case _ => this.logger.debug(s"Parse query SQL异常,无法匹配该Statement. 
") } }) } /** * 用于解析DDL语句中的库表、分区信息 * * @return 返回sink目标表,用于维护表与表之间的关系 */ override def ddlParser(logicalPlan: LogicalPlan): Option[TableIdentifier] = { var sinkTable: Option[TableIdentifier] = None logicalPlan match { // insert into语句解析 case insertInto: InsertIntoStatement => { val identifier = insertInto.table.asInstanceOf[UnresolvedRelation].multipartIdentifier this.addCatalog(identifier, Operation.INSERT_INTO) // 维护分区信息 val fireTableIdentifier = toTableIdentifier(identifier) val partitions = insertInto.partitionSpec.map(part => (part._1, if (part._2.isDefined) part._2.get else "")) SQLLineageManager.setPartitions(fireTableIdentifier, partitions.toSeq) sinkTable = Some(fireTableIdentifier) } // rename table语句解析 case renameTable: RenameTable => { this.addCatalog(renameTable.newName, Operation.RENAME_TABLE_OLD) this.addCatalog(renameTable.newName, Operation.RENAME_TABLE_NEW) SQLLineageManager.addRelation(toTableIdentifier(renameTable.newName), toTableIdentifier(renameTable.newName)) } // create table as select语句解析 case createTableAsSelect: CreateTableAsSelectStatement => { val identifier = this.toTableIdentifier(createTableAsSelect.tableName) this.addCatalog(identifier, Operation.CREATE_TABLE_AS_SELECT) // 采集建表属性信息 SQLLineageManager.setOptions(identifier, createTableAsSelect.properties) sinkTable = Some(identifier) } // create table语句解析 case createTable: CreateTable => { val identifier = this.toFireTableIdentifier(createTable.tableDesc.identifier) this.addCatalog(identifier, Operation.CREATE_TABLE) sinkTable = Some(identifier) // 采集建表属性信息 SQLLineageManager.setOptions(identifier, createTable.tableDesc.properties) // 采集分区字段信息 val partitions = createTable.tableDesc.partitionSchema.map(st => (st.dataType.toString, st.name)) SQLLineageManager.setPartitions(identifier, partitions) } // rename partition语句解析 case renamePartition: AlterTableRenamePartitionCommand => { val oldTable = this.toFireTableIdentifier(renamePartition.tableName) val newTable = this.toFireTableIdentifier(renamePartition.tableName) this.addCatalog(oldTable, Operation.RENAME_PARTITION_OLD) this.addCatalog(newTable, Operation.RENAME_PARTITION_NEW) SQLLineageManager.setPartitions(oldTable, renamePartition.oldPartition.toSeq) SQLLineageManager.setPartitions(newTable, renamePartition.newPartition.toSeq) } case _ => this.logger.debug(s"Parse ddl SQL异常,无法匹配该Statement.") } sinkTable } } ================================================ FILE: fire-engines/fire-spark/src/main/scala-spark-3.3/com/zto/fire/spark/sql/SparkSqlExtensionsParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.sql import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.types.{DataType, StructType} /** * Spark Sql解析扩展,用于拦截执行的sql以及解析sql中的血缘 * * @author ChengLong 2021-6-23 10:25:17 * @since 2.0.0 */ private[fire] class SparkSqlExtensionsParser(sparkSession: SparkSession, parser: ParserInterface) extends SparkSqlExtensionsParserBase(sparkSession, parser) with ParserInterface { override def parseTableIdentifier(sqlText: String): TableIdentifier = { parser.parseTableIdentifier(sqlText) } override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = { parser.parseFunctionIdentifier(sqlText) } override def parseMultipartIdentifier(sqlText: String): Seq[String] = { parser.parseMultipartIdentifier(sqlText) } override def parseTableSchema(sqlText: String): StructType = { parser.parseTableSchema(sqlText) } override def parseDataType(sqlText: String): DataType = { parser.parseDataType(sqlText) } override def parseQuery(sqlText: String): LogicalPlan = { parser.parseQuery(sqlText) } } ================================================ FILE: fire-engines/fire-spark/src/main/scala-spark-3.3/com/zto/fire/spark/sql/SparkSqlParser.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.spark.sql import com.zto.fire.common.bean.TableIdentifier import com.zto.fire.common.enu.Operation import com.zto.fire.common.util.SQLLineageManager import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.command.AlterTableRenamePartitionCommand import org.apache.spark.sql.execution.datasources.CreateTable /** * Spark SQL解析器,用于解析Spark SQL语句中的库、表、分区、操作类型等信息 * * @author ChengLong 2021-6-18 16:31:04 * @since 2.0.0 */ private[fire] object SparkSqlParser extends SparkSqlParserBase { /** * 用于解析查询sql中的库表信息 * * @param sinkTable * 当insert xxx select或create xxx select语句时,sinkTable不为空 */ override def queryParser(logicalPlan: LogicalPlan, sinkTable: Option[TableIdentifier]): Unit = { logicalPlan.children.foreach(child => { this.queryParser(child, sinkTable) var sourceTable: Option[TableIdentifier] = None child match { case unresolvedRelation: UnresolvedRelation => this.addCatalog(unresolvedRelation.multipartIdentifier, Operation.SELECT) sourceTable = Some(toTableIdentifier(unresolvedRelation.multipartIdentifier)) // 如果是insert xxx select或create xxx select语句,则维护表与表之间的关系 if (sinkTable.isDefined) SQLLineageManager.addRelation(toTableIdentifier(unresolvedRelation.multipartIdentifier), sinkTable.get) case _ => this.logger.debug(s"Parse query SQL异常,无法匹配该Statement. ") } }) } /** * 用于解析DDL语句中的库表、分区信息 * * @return 返回sink目标表,用于维护表与表之间的关系 */ override def ddlParser(logicalPlan: LogicalPlan): Option[TableIdentifier] = { var sinkTable: Option[TableIdentifier] = None logicalPlan match { // insert into语句解析 case insertInto: InsertIntoStatement => { val identifier = insertInto.table.asInstanceOf[UnresolvedRelation].multipartIdentifier this.addCatalog(identifier, Operation.INSERT_INTO) // 维护分区信息 val fireTableIdentifier = toTableIdentifier(identifier) val partitions = insertInto.partitionSpec.map(part => (part._1, if (part._2.isDefined) part._2.get else "")) SQLLineageManager.setPartitions(fireTableIdentifier, partitions.toSeq) sinkTable = Some(fireTableIdentifier) } // rename table语句解析 case renameTable: RenameTable => { this.addCatalog(renameTable.newName, Operation.RENAME_TABLE_OLD) this.addCatalog(renameTable.newName, Operation.RENAME_TABLE_NEW) SQLLineageManager.addRelation(toTableIdentifier(renameTable.newName), toTableIdentifier(renameTable.newName)) } // create table as select语句解析 case createTableAsSelect: CreateTableAsSelect => { val identifier = TableIdentifier(createTableAsSelect.tableName.name()) this.addCatalog(identifier, Operation.CREATE_TABLE_AS_SELECT) // 采集建表属性信息 SQLLineageManager.setOptions(identifier, createTableAsSelect.writeOptions) sinkTable = Some(identifier) } // create table语句解析 case createTable: CreateTable => { val identifier = this.toFireTableIdentifier(createTable.tableDesc.identifier) this.addCatalog(identifier, Operation.CREATE_TABLE) sinkTable = Some(identifier) // 采集建表属性信息 SQLLineageManager.setOptions(identifier, createTable.tableDesc.properties) // 采集分区字段信息 val partitions = createTable.tableDesc.partitionSchema.map(st => (st.dataType.toString, st.name)) SQLLineageManager.setPartitions(identifier, partitions) } // rename partition语句解析 case renamePartition: AlterTableRenamePartitionCommand => { val table = this.toFireTableIdentifier(renamePartition.tableName) this.addCatalog(table, Operation.RENAME_PARTITION_OLD) this.addCatalog(table, Operation.RENAME_PARTITION_NEW) SQLLineageManager.setPartitions(table, renamePartition.oldPartition.toSeq) SQLLineageManager.setPartitions(table, 
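          // new partition spec is recorded against the same table identifier: a partition rename does not change the table itself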
renamePartition.newPartition.toSeq) } case _ => this.logger.debug(s"Parse ddl SQL异常,无法匹配该Statement.") } sinkTable } } ================================================ FILE: fire-engines/pom.xml ================================================ 4.0.0 fire-engines pom Fire : Engines : com.zto.fire fire-parent 2.3.2-SNAPSHOT ../pom.xml fire-spark fire-flink com.zto.fire fire-common_${scala.binary.version} ${fire.version} ${maven.scope} com.zto.fire fire-core_${scala.binary.version} ${fire.version} ${maven.scope} com.zto.fire fire-connector-jdbc_${scala.binary.version} ${fire.version} ${maven.scope} com.zto.fire fire-connector-hbase_${scala.binary.version} ${fire.version} ${maven.scope} com.zto.fire fire-metrics_${scala.binary.version} ${fire.version} ${maven.scope} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-enhance/apache-arthas/pom.xml ================================================ 4.0.0 fire-enhance-arthas_${scala.binary.version} 2.3.2-SNAPSHOT jar Fire : Enhance : Arthas com.zto.fire fire-enhance 2.3.2-SNAPSHOT ../pom.xml com.taobao.arthas arthas-agent-attach ${arthas.version} com.taobao.arthas arthas-packaging ${arthas.version} com.taobao.arthas arthas-core ${arthas.version} com.taobao.arthas arthas-spy ${arthas.version} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-enhance/apache-arthas/src/main/java/com/taobao/arthas/agent/attach/ArthasAgent.java ================================================ package com.taobao.arthas.agent.attach; import net.bytebuddy.agent.ByteBuddyAgent; import org.zeroturnaround.zip.ZipUtil; import java.arthas.SpyAPI; import java.io.File; import java.lang.instrument.Instrumentation; import java.net.URL; import java.util.HashMap; import java.util.Map; /** * * @author hengyunabc 2020-06-22 * */ public class ArthasAgent { private static final int TEMP_DIR_ATTEMPTS = 10000; private static final String ARTHAS_CORE_JAR = "arthas-core.jar"; private static final String ARTHAS_BOOTSTRAP = "com.taobao.arthas.core.server.ArthasBootstrap"; private static final String GET_INSTANCE = "getInstance"; private static final String IS_BIND = "isBind"; private String errorMessage; private Map configMap = new HashMap(); private String arthasHome; private boolean slientInit; private Instrumentation instrumentation; // TODO: ------------ start:二次开发代码 --------------- // public static Object bootstrap = null; // TODO: ------------ end:二次开发代码 --------------- // public ArthasAgent() { this(null, null, false, null); } public ArthasAgent(Map configMap) { this(configMap, null, false, null); } public ArthasAgent(String arthasHome) { this(null, arthasHome, false, null); } public ArthasAgent(Map configMap, String arthasHome, boolean slientInit, Instrumentation instrumentation) { if (configMap != null) { this.configMap = configMap; } this.arthasHome = arthasHome; this.slientInit = slientInit; this.instrumentation = instrumentation; } public static void attach() { new ArthasAgent().init(); } /** * @see https://arthas.aliyun.com/doc/arthas-properties.html * @param configMap */ public static void attach(Map configMap) { new ArthasAgent(configMap).init(); } /** * use the specified arthas * @param arthasHome arthas directory */ public static void attach(String arthasHome) { new ArthasAgent().init(); } public void init() throws IllegalStateException { // 尝试判断arthas是否已在运行,如果是的话,直接就退出 try { Class.forName("java.arthas.SpyAPI"); 
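            // Probe for the Arthas Spy API class to detect whether an agent is already attached to this JVM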
// 加载不到会抛异常 if (SpyAPI.isInited()) { return; } } catch (Throwable e) { // ignore } try { if (instrumentation == null) { instrumentation = ByteBuddyAgent.install(); } // 检查 arthasHome if (arthasHome == null || arthasHome.trim().isEmpty()) { // 解压出 arthasHome URL coreJarUrl = this.getClass().getClassLoader().getResource("arthas-bin.zip"); if (coreJarUrl != null) { File tempArthasDir = createTempDir(); ZipUtil.unpack(coreJarUrl.openStream(), tempArthasDir); arthasHome = tempArthasDir.getAbsolutePath(); } else { throw new IllegalArgumentException("can not getResources arthas-bin.zip from classloader: " + this.getClass().getClassLoader()); } } // find arthas-core.jar File arthasCoreJarFile = new File(arthasHome, ARTHAS_CORE_JAR); if (!arthasCoreJarFile.exists()) { throw new IllegalStateException("can not find arthas-core.jar under arthasHome: " + arthasHome); } AttachArthasClassloader arthasClassLoader = new AttachArthasClassloader( new URL[] { arthasCoreJarFile.toURI().toURL() }); /** *
             * ArthasBootstrap bootstrap = ArthasBootstrap.getInstance(instrumentation, configMap);
             * 
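             * boolean isBind = bootstrap.isBind();  // direct-call form of the reflective isBind check performed below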
*/ Class bootstrapClass = arthasClassLoader.loadClass(ARTHAS_BOOTSTRAP); bootstrap = bootstrapClass.getMethod(GET_INSTANCE, Instrumentation.class, Map.class).invoke(null, instrumentation, configMap); boolean isBind = (Boolean) bootstrapClass.getMethod(IS_BIND).invoke(bootstrap); if (!isBind) { String errorMsg = "Arthas server port binding failed! Please check $HOME/logs/arthas/arthas.log for more details."; throw new RuntimeException(errorMsg); } } catch (Throwable e) { errorMessage = e.getMessage(); if (!slientInit) { throw new IllegalStateException(e); } } } private static File createTempDir() { File baseDir = new File(System.getProperty("java.io.tmpdir")); String baseName = "arthas-" + System.currentTimeMillis() + "-"; for (int counter = 0; counter < TEMP_DIR_ATTEMPTS; counter++) { File tempDir = new File(baseDir, baseName + counter); if (tempDir.mkdir()) { return tempDir; } } throw new IllegalStateException("Failed to create directory within " + TEMP_DIR_ATTEMPTS + " attempts (tried " + baseName + "0 to " + baseName + (TEMP_DIR_ATTEMPTS - 1) + ')'); } public String getErrorMessage() { return errorMessage; } public void setErrorMessage(String errorMessage) { this.errorMessage = errorMessage; } } ================================================ FILE: fire-enhance/apache-flink/pom.xml ================================================ 4.0.0 fire-enhance-flink_${flink.reference} 2.3.2-SNAPSHOT jar Fire : Enhance : Flink com.zto.fire fire-enhance 2.3.2-SNAPSHOT ../pom.xml com.zto.fire fire-common_${scala.binary.version} ${fire.version} ${maven.scope} com.sparkjava spark-core ${sparkjava.version} javax.servlet javax.servlet-api 3.1.0 org.apache.flink flink-java ${flink.version} ${maven.scope} org.apache.flink flink-scala_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-streaming-scala_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-clients_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-runtime-web_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-queryable-state-client-java ${flink.version} ${maven.scope} org.apache.flink flink-statebackend-rocksdb_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-connector-kafka_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.kafka kafka_${scala.binary.version} ${kafka.version} ${maven.scope} org.apache.flink flink-table-api-java ${flink.version} ${maven.scope} org.apache.flink flink-table-api-java-bridge_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-table-api-scala-bridge_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-table-common ${flink.version} ${maven.scope} org.apache.flink flink-connector-hive_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-connector-jdbc_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-connector-elasticsearch-base_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-hadoop-compatibility_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-shaded-hadoop-2-uber 2.6.5-8.0 ${maven.scope} org.apache.hive hive-exec ${hive.apache.version} ${maven.scope} org.apache.hbase hbase-common ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-server ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-client ${hbase.version} ${maven.scope} 
org.apache.rocketmq rocketmq-client ${rocketmq.version} ${maven.scope} org.apache.rocketmq rocketmq-acl ${rocketmq.version} ${maven.scope} org.apache.curator curator-recipes ${curator.verrsion} org.apache.calcite calcite-core ${calcite.version} ${maven.scope} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.12/org/apache/flink/client/deployment/application/ApplicationDispatcherBootstrap.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.client.deployment.application; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.JobID; import org.apache.flink.api.common.time.Time; import org.apache.flink.client.ClientUtils; import org.apache.flink.client.cli.ClientOptions; import org.apache.flink.client.deployment.application.executors.EmbeddedExecutor; import org.apache.flink.client.deployment.application.executors.EmbeddedExecutorServiceLoader; import org.apache.flink.client.program.PackagedProgram; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.HighAvailabilityOptions; import org.apache.flink.configuration.PipelineOptionsInternal; import org.apache.flink.core.execution.PipelineExecutorServiceLoader; import org.apache.flink.runtime.clusterframework.ApplicationStatus; import org.apache.flink.runtime.concurrent.FutureUtils; import org.apache.flink.runtime.concurrent.ScheduledExecutor; import org.apache.flink.runtime.dispatcher.DispatcherBootstrap; import org.apache.flink.runtime.dispatcher.DispatcherGateway; import org.apache.flink.runtime.jobgraph.JobGraph; import org.apache.flink.runtime.jobmanager.HighAvailabilityMode; import org.apache.flink.runtime.jobmaster.JobResult; import org.apache.flink.runtime.messages.Acknowledge; import org.apache.flink.runtime.rpc.FatalErrorHandler; import org.apache.flink.util.ExceptionUtils; import org.apache.flink.util.FlinkException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletionException; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; import java.util.function.Function; import java.util.stream.Collectors; import static org.apache.flink.util.Preconditions.checkNotNull; /** * A {@link DispatcherBootstrap} used for running the user's {@code main()} in "Application Mode" * (see FLIP-85). * *

This dispatcher bootstrap submits the recovered {@link JobGraph job graphs} for re-execution * (in case of recovery from a failure), and then submits the remaining jobs of the application for * execution. * *

To achieve this, it works in conjunction with the {@link EmbeddedExecutor EmbeddedExecutor} * which decides if it should submit a job for execution (in case of a new job) or the job was * already recovered and is running. */ @Internal public class ApplicationDispatcherBootstrap implements DispatcherBootstrap { private static final Logger LOG = LoggerFactory.getLogger(ApplicationDispatcherBootstrap.class); public static final JobID ZERO_JOB_ID = new JobID(0, 0); private final PackagedProgram application; private final Collection recoveredJobIds; private final Configuration configuration; private final FatalErrorHandler errorHandler; private final CompletableFuture applicationCompletionFuture; private final CompletableFuture clusterShutdownFuture; private ScheduledFuture applicationExecutionTask; public ApplicationDispatcherBootstrap( final PackagedProgram application, final Collection recoveredJobIds, final Configuration configuration, final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor, final FatalErrorHandler errorHandler) { this.configuration = checkNotNull(configuration); this.recoveredJobIds = checkNotNull(recoveredJobIds); this.application = checkNotNull(application); this.errorHandler = checkNotNull(errorHandler); this.applicationCompletionFuture = fixJobIdAndRunApplicationAsync(dispatcherGateway, scheduledExecutor); this.clusterShutdownFuture = runApplicationAndShutdownClusterAsync(dispatcherGateway); } @Override public void stop() { if (applicationExecutionTask != null) { applicationExecutionTask.cancel(true); } if (applicationCompletionFuture != null) { applicationCompletionFuture.cancel(true); } } @VisibleForTesting ScheduledFuture getApplicationExecutionFuture() { return applicationExecutionTask; } @VisibleForTesting CompletableFuture getApplicationCompletionFuture() { return applicationCompletionFuture; } @VisibleForTesting CompletableFuture getClusterShutdownFuture() { return clusterShutdownFuture; } /** * Runs the user program entrypoint and shuts down the given dispatcherGateway when the * application completes (either successfully or in case of failure). 
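     * Normal completion shuts the cluster down with ApplicationStatus.SUCCEEDED; a CANCELED or
     * FAILED status carried by an UnsuccessfulExecutionException is forwarded to shutDownCluster
     * as-is, and any other failure is reported to the fatal error handler.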
*/ private CompletableFuture runApplicationAndShutdownClusterAsync( final DispatcherGateway dispatcherGateway) { return applicationCompletionFuture .handle( (r, t) -> { if (t == null) { LOG.info("Application completed SUCCESSFULLY"); return dispatcherGateway.shutDownCluster( ApplicationStatus.SUCCEEDED); } final Optional exception = ExceptionUtils.findThrowable( t, UnsuccessfulExecutionException.class); if (exception.isPresent()) { final ApplicationStatus applicationStatus = exception.get().getStatus(); if (applicationStatus == ApplicationStatus.CANCELED || applicationStatus == ApplicationStatus.FAILED) { LOG.info("Application {}: ", applicationStatus, t); return dispatcherGateway.shutDownCluster(applicationStatus); } } LOG.warn("Application failed unexpectedly: ", t); this.errorHandler.onFatalError( new FlinkException("Application failed unexpectedly.", t)); return FutureUtils.completedExceptionally(t); }) .thenCompose(Function.identity()); } private CompletableFuture fixJobIdAndRunApplicationAsync( final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor) { final Optional configuredJobId = configuration.getOptional(PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID); if (!HighAvailabilityMode.isHighAvailabilityModeActivated(configuration) && !configuredJobId.isPresent()) { return runApplicationAsync(dispatcherGateway, scheduledExecutor, false); } // TODO: ------------ start:二次开发代码 --------------- // if (!configuredJobId.isPresent()) { String haClusterId = configuration.getString(HighAvailabilityOptions.HA_CLUSTER_ID); String[] splits = (haClusterId != null ? haClusterId : "").split("_"); if (splits != null && splits.length == 3) { configuration.set( PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID, new JobID(Long.valueOf(splits[1]), Long.valueOf(splits[2])).toHexString()); } else { configuration.set( PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID, ZERO_JOB_ID.toHexString()); } } // TODO: ------------ end:二次开发代码 --------------- // return runApplicationAsync(dispatcherGateway, scheduledExecutor, true); } /** * Runs the user program entrypoint by scheduling a task on the given {@code scheduledExecutor}. * The returned {@link CompletableFuture} completes when all jobs of the user application * succeeded. if any of them fails, or if job submission fails. */ private CompletableFuture runApplicationAsync( final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor, final boolean enforceSingleJobExecution) { final CompletableFuture> applicationExecutionFuture = new CompletableFuture<>(); // we need to hand in a future as return value because we need to get those JobIs out // from the scheduled task that executes the user program applicationExecutionTask = scheduledExecutor.schedule( () -> runApplicationEntryPoint( applicationExecutionFuture, dispatcherGateway, scheduledExecutor, enforceSingleJobExecution), 0L, TimeUnit.MILLISECONDS); return applicationExecutionFuture.thenCompose( jobIds -> getApplicationResult(dispatcherGateway, jobIds, scheduledExecutor)); } /** * Runs the user program entrypoint and completes the given {@code jobIdsFuture} with the {@link * JobID JobIDs} of the submitted jobs. * *

This should be executed in a separate thread (or task). */ private void runApplicationEntryPoint( final CompletableFuture> jobIdsFuture, final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor, final boolean enforceSingleJobExecution) { try { final List applicationJobIds = new ArrayList<>(recoveredJobIds); final PipelineExecutorServiceLoader executorServiceLoader = new EmbeddedExecutorServiceLoader( applicationJobIds, dispatcherGateway, scheduledExecutor); ClientUtils.executeProgram( executorServiceLoader, configuration, application, enforceSingleJobExecution, true /* suppress sysout */); if (applicationJobIds.isEmpty()) { jobIdsFuture.completeExceptionally( new ApplicationExecutionException( "The application contains no execute() calls.")); } else { jobIdsFuture.complete(applicationJobIds); } } catch (Throwable t) { // TODO: ------------ start:二次开发代码 --------------- // ExceptionUtils.stringifyException(t); // TODO: ------------ end:二次开发代码 --------------- // jobIdsFuture.completeExceptionally( new ApplicationExecutionException("Could not execute application.", t)); } } private CompletableFuture getApplicationResult( final DispatcherGateway dispatcherGateway, final Collection applicationJobIds, final ScheduledExecutor executor) { final List> jobResultFutures = applicationJobIds.stream() .map( jobId -> unwrapJobResultException( getJobResult(dispatcherGateway, jobId, executor))) .collect(Collectors.toList()); return FutureUtils.waitForAll(jobResultFutures); } private CompletableFuture getJobResult( final DispatcherGateway dispatcherGateway, final JobID jobId, final ScheduledExecutor scheduledExecutor) { final Time timeout = Time.milliseconds(configuration.get(ClientOptions.CLIENT_TIMEOUT).toMillis()); final Time retryPeriod = Time.milliseconds(configuration.get(ClientOptions.CLIENT_RETRY_PERIOD).toMillis()); return JobStatusPollingUtils.getJobResult( dispatcherGateway, jobId, scheduledExecutor, timeout, retryPeriod); } /** * If the given {@link JobResult} indicates success, this passes through the {@link JobResult}. * Otherwise, this returns a future that is finished exceptionally (potentially with an * exception from the {@link JobResult}. */ private CompletableFuture unwrapJobResultException( final CompletableFuture jobResult) { return jobResult.thenApply( result -> { if (result.isSuccess()) { return result; } throw new CompletionException( UnsuccessfulExecutionException.fromJobResult( result, application.getUserCodeClassLoader())); }); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.12/org/apache/flink/configuration/GlobalConfiguration.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.configuration; import com.zto.fire.common.conf.FireFrameworkConf; import com.zto.fire.common.util.OSUtils; import com.zto.fire.common.util.PropUtils; import org.apache.flink.annotation.Internal; import org.apache.flink.util.Preconditions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.collection.JavaConversions; import javax.annotation.Nullable; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.InputStreamReader; import java.lang.reflect.Method; import java.net.ServerSocket; import java.util.HashMap; import java.util.Map; import java.util.concurrent.atomic.AtomicBoolean; /** * Global configuration object for Flink. Similar to Java properties configuration * objects it includes key-value pairs which represent the framework's configuration. */ @Internal public final class GlobalConfiguration { private static final Logger LOG = LoggerFactory.getLogger(GlobalConfiguration.class); private static AtomicBoolean isStart = new AtomicBoolean(false); public static final String FLINK_CONF_FILENAME = "flink-conf.yaml"; // the hidden content to be displayed public static final String HIDDEN_CONTENT = "******"; // TODO: ------------ start:二次开发代码 --------------- // // 用于判断是JobManager还是TaskManager private static boolean isJobManager = false; // fire rest服务占用端口 private static ServerSocket restServerSocket; // 任务的运行模式 private static String runMode; private static final Map settings = new HashMap<>(); static { try { restServerSocket = new ServerSocket(0); } catch (Exception e) { LOG.error("创建Socket失败", e); } } /** * 获取配置信息 */ public static Map getSettings() { return settings; } /** * 获取随机分配的Rest端口号 */ public static int getRestPort() { return restServerSocket.getLocalPort(); } /** * 获取rest服务端口号,并关闭Socket */ public static int getRestPortAndClose() { int port = restServerSocket.getLocalPort(); if (restServerSocket != null && !restServerSocket.isClosed()) { try { restServerSocket.close(); } catch (Exception e) { LOG.error("关闭Rest Socket失败", e); } } return port; } // TODO: ------------ end:二次开发代码 ----------------- // // -------------------------------------------------------------------------------------------- private GlobalConfiguration() { } // -------------------------------------------------------------------------------------------- /** * Loads the global configuration from the environment. Fails if an error occurs during loading. Returns an * empty configuration object if the environment variable is not set. In production this variable is set but * tests and local execution/debugging don't have this environment variable set. That's why we should fail * if it is not set. * * @return Returns the Configuration */ public static Configuration loadConfiguration() { return loadConfiguration(new Configuration()); } /** * Loads the global configuration and adds the given dynamic properties * configuration. * * @param dynamicProperties The given dynamic properties * @return Returns the loaded global configuration with dynamic properties */ public static Configuration loadConfiguration(Configuration dynamicProperties) { final String configDir = System.getenv(ConfigConstants.ENV_FLINK_CONF_DIR); if (configDir == null) { return new Configuration(dynamicProperties); } return loadConfiguration(configDir, dynamicProperties); } /** * Loads the configuration files from the specified directory. * *

YAML files are supported as configuration files. * * @param configDir the directory which contains the configuration files */ public static Configuration loadConfiguration(final String configDir) { isJobManager = true; return loadConfiguration(configDir, null); } /** * Loads the configuration files from the specified directory. If the dynamic properties * configuration is not null, then it is added to the loaded configuration. * * @param configDir directory to load the configuration from * @param dynamicProperties configuration file containing the dynamic properties. Null if none. * @return The configuration loaded from the given configuration directory */ public static Configuration loadConfiguration(final String configDir, @Nullable final Configuration dynamicProperties) { if (configDir == null) { throw new IllegalArgumentException("Given configuration directory is null, cannot load configuration"); } final File confDirFile = new File(configDir); if (!(confDirFile.exists())) { throw new IllegalConfigurationException( "The given configuration directory name '" + configDir + "' (" + confDirFile.getAbsolutePath() + ") does not describe an existing directory."); } // get Flink yaml configuration file final File yamlConfigFile = new File(confDirFile, FLINK_CONF_FILENAME); if (!yamlConfigFile.exists()) { throw new IllegalConfigurationException( "The Flink config file '" + yamlConfigFile + "' (" + confDirFile.getAbsolutePath() + ") does not exist."); } Configuration configuration = loadYAMLResource(yamlConfigFile); if (dynamicProperties != null) { configuration.addAll(dynamicProperties); } return configuration; } /** * Loads a YAML-file of key-value pairs. * *

Colon and whitespace ": " separate key and value (one per line). The hash tag "#" starts a single-line comment. * *

Example: * *

     * jobmanager.rpc.address: localhost # network address for communication with the job manager
     * jobmanager.rpc.port   : 6123      # network port to connect to for communication with the job manager
     * taskmanager.rpc.port  : 6122      # network port the task manager expects incoming IPC connections
     * 
* *

This does not span the whole YAML specification, but only the *syntax* of simple YAML key-value pairs (see issue * #113 on GitHub). If at any point in time, there is a need to go beyond simple key-value pairs syntax * compatibility will allow to introduce a YAML parser library. * * @param file the YAML file to read from * @see YAML 1.2 specification */ private static Configuration loadYAMLResource(File file) { final Configuration config = new Configuration(); Method setSetting = null; try { Class env = Class.forName("org.apache.flink.runtime.util.EnvironmentInformation"); setSetting = env.getMethod("setSetting", String.class, String.class); } catch (Exception e) { LOG.error("获取EnvironmentInformation.setSetting()失败", e); } try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)))) { String line; int lineNo = 0; while ((line = reader.readLine()) != null) { lineNo++; // 1. check for comments String[] comments = line.split("#", 2); String conf = comments[0].trim(); // 2. get key and value if (conf.length() > 0) { String[] kv = conf.split(": ", 2); // skip line with no valid key-value pair if (kv.length == 1) { LOG.warn("Error while trying to split key and value in configuration file {}:{}: {}", file, lineNo, line); continue; } String key = kv[0].trim(); String value = kv[1].trim(); // sanity check if (key.length() == 0 || value.length() == 0) { LOG.warn("Error after splitting key and value in configuration file {}:{}:{}", file, lineNo, line); continue; } LOG.info("Loading configuration property: {}, {}", key, isSensitive(key) ? HIDDEN_CONTENT : value); config.setString(key, value); // TODO: ------------ start:二次开发代码 --------------- // setSetting.invoke(null, key, value); // TODO: ------------ end:二次开发代码 --------------- // } } } catch (Exception e) { throw new RuntimeException("Error parsing YAML configuration.", e); } fireBootstrap(config); return config; } // TODO: ------------ start:二次开发代码 --------------- // /** * fire框架相关初始化动作 */ private static void fireBootstrap(Configuration config) { if (isStart.compareAndSet(false, true)) { // 加载必要的配置文件 loadTaskConfiguration(config); } } /** * 获取当前任务运行模式 */ public static String getRunMode() { return runMode; } /** * 加载必要的配置文件 */ private static void loadTaskConfiguration(Configuration config) { // 用于加载任务同名配置文件中的flink参数 // 获取当前任务的类名称 String className = config.getString("$internal.application.main", config.getString("flink.fire.className", "")); // 获取当前任务的运行模式:yarn-application或yarn-per-job runMode = config.getString("flink.execution.target", config.getString("execution.target", "")); try { Class env = Class.forName("org.apache.flink.runtime.util.EnvironmentInformation"); Method method = env.getMethod("isJobManager"); isJobManager = Boolean.valueOf(method.invoke(null) + ""); } catch (Exception e) { LOG.error("调用EnvironmentInformation.isJobManager()失败", e); } // 配置信息仅在JobManager端进行加载,TaskManager端会被主动的merge if (isJobManager && className != null && className.contains(".")) { String simpleClassName = className.substring(className.lastIndexOf('.') + 1); if (simpleClassName.length() > 0) { PropUtils.setProperty("driver.class.name", className); // TODO: 判断批处理模式,并加载对应配置文件 // PropUtils.load(FireFrameworkConf.FLINK_BATCH_CONF_FILE) PropUtils.loadFile(FireFrameworkConf.FLINK_STREAMING_CONF_FILE()); // 将所有configuration信息同步到PropUtils中 PropUtils.setProperties(config.confData); // 加载用户公共配置文件 PropUtils.load(FireFrameworkConf.userCommonConf()); // 加载任务同名的配置文件 // PropUtils.loadJobConf(className); // 构建fire rest接口地址 
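                // the URL combines the JobManager's local IP with the random port reserved by restServerSocket
                // at class-load time, e.g. http://<jobmanager-ip>:<random-port> (illustrative values)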
PropUtils.setProperty(FireFrameworkConf.FIRE_REST_URL(), "http://" + OSUtils.getIp() + ":" + getRestPort()); // 加载外部系统配置信息,覆盖同名配置文件中的配置,实现动态替换 PropUtils.loadJobConf(className); PropUtils.setProperty("flink.run.mode", runMode); Map settingMap = (Map) JavaConversions.mapAsJavaMap(PropUtils.settings()); settingMap.forEach((k, v) -> { config.setString(k, v); settings.put(k, v); }); } } } /** * Check whether the key is a hidden key. * * @param key the config key */ public static boolean isSensitive(String key) { Preconditions.checkNotNull(key, "key is null"); final String keyInLower = key.toLowerCase(); // 用于隐藏webui中敏感信息 String hideKeys = ((Map) JavaConversions.mapAsJavaMap(PropUtils.settings())).getOrDefault("fire.conf.print.blacklist", "password,secret,fs.azure.account.key"); if (hideKeys != null && hideKeys.length() > 0) { String[] hideKeyArr = hideKeys.split(","); for (String hideKey : hideKeyArr) { if (keyInLower.length() >= hideKey.length() && keyInLower.contains(hideKey)) { return true; } } } return false; } // TODO: ------------ end:二次开发代码 ----------------- // } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.12/org/apache/flink/contrib/streaming/state/RocksDBStateBackend.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.contrib.streaming.state; import com.zto.fire.common.util.PropUtils; import org.apache.commons.lang3.StringUtils; import org.apache.curator.framework.CuratorFramework; import org.apache.curator.framework.CuratorFrameworkFactory; import org.apache.curator.framework.recipes.atomic.AtomicValue; import org.apache.curator.framework.recipes.atomic.DistributedAtomicInteger; import org.apache.curator.retry.ExponentialBackoffRetry; import org.apache.curator.retry.RetryOneTime; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.ExecutionConfig; import org.apache.flink.api.common.JobID; import org.apache.flink.api.common.typeutils.TypeSerializer; import org.apache.flink.configuration.CheckpointingOptions; import org.apache.flink.configuration.IllegalConfigurationException; import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.configuration.TaskManagerOptions; import org.apache.flink.core.fs.CloseableRegistry; import org.apache.flink.core.fs.Path; import org.apache.flink.metrics.MetricGroup; import org.apache.flink.runtime.execution.Environment; import org.apache.flink.runtime.memory.OpaqueMemoryResource; import org.apache.flink.runtime.query.TaskKvStateRegistry; import org.apache.flink.runtime.state.*; import org.apache.flink.runtime.state.filesystem.FsStateBackend; import org.apache.flink.runtime.state.ttl.TtlTimeProvider; import org.apache.flink.util.*; import org.rocksdb.NativeLibraryLoader; import org.rocksdb.RocksDB; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nonnull; import javax.annotation.Nullable; import java.io.File; import java.io.IOException; import java.lang.reflect.Field; import java.net.URI; import java.util.*; import static org.apache.flink.contrib.streaming.state.RocksDBConfigurableOptions.WRITE_BATCH_SIZE; import static org.apache.flink.contrib.streaming.state.RocksDBOptions.CHECKPOINT_TRANSFER_THREAD_NUM; import static org.apache.flink.contrib.streaming.state.RocksDBOptions.TIMER_SERVICE_FACTORY; import static org.apache.flink.util.Preconditions.checkArgument; import static org.apache.flink.util.Preconditions.checkNotNull; /** * A State Backend that stores its state in {@code RocksDB}. This state backend can store very large * state that exceeds memory and spills to disk. * *

All key/value state (including windows) is stored in the key/value index of RocksDB. For * persistence against loss of machines, checkpoints take a snapshot of the RocksDB database, and * persist that snapshot in a file system (by default) or another configurable state backend. * *

The behavior of the RocksDB instances can be parametrized by setting RocksDB Options using the * methods {@link #setPredefinedOptions(PredefinedOptions)} and {@link * #setRocksDBOptions(RocksDBOptionsFactory)}. */ public class RocksDBStateBackend extends AbstractManagedMemoryStateBackend implements ConfigurableStateBackend { /** * The options to chose for the type of priority queue state. */ public enum PriorityQueueStateType { HEAP, ROCKSDB } private static final long serialVersionUID = 1L; private static final Logger LOG = LoggerFactory.getLogger(RocksDBStateBackend.class); /** * The number of (re)tries for loading the RocksDB JNI library. */ private static final int ROCKSDB_LIB_LOADING_ATTEMPTS = 3; /** * Flag whether the native library has been loaded. */ private static boolean rocksDbInitialized = false; private static final int UNDEFINED_NUMBER_OF_TRANSFER_THREADS = -1; private static final long UNDEFINED_WRITE_BATCH_SIZE = -1; // ------------------------------------------------------------------------ // -- configuration values, set in the application / configuration /** * The state backend that we use for creating checkpoint streams. */ private final StateBackend checkpointStreamBackend; /** * Base paths for RocksDB directory, as configured. Null if not yet set, in which case the * configuration values will be used. The configuration defaults to the TaskManager's temp * directories. */ @Nullable private File[] localRocksDbDirectories; /** * The pre-configured option settings. */ @Nullable private PredefinedOptions predefinedOptions; /** * The options factory to create the RocksDB options in the cluster. */ @Nullable private RocksDBOptionsFactory rocksDbOptionsFactory; /** * This determines if incremental checkpointing is enabled. */ private final TernaryBoolean enableIncrementalCheckpointing; /** * Thread number used to transfer (download and upload) state, default value: 1. */ private int numberOfTransferThreads; /** * The configuration for memory settings (pool sizes, etc.). */ private final RocksDBMemoryConfiguration memoryConfiguration; /** * This determines the type of priority queue state. */ @Nullable private PriorityQueueStateType priorityQueueStateType; /** * The default rocksdb metrics options. */ private final RocksDBNativeMetricOptions defaultMetricOptions; // -- runtime values, set on TaskManager when initializing / using the backend /** * Base paths for RocksDB directory, as initialized. */ private transient File[] initializedDbBasePaths; /** * JobID for uniquifying backup paths. */ private transient JobID jobId; /** * The index of the next directory to be used from {@link #initializedDbBasePaths}. */ private transient int nextDirectory; /** * Whether we already lazily initialized our local storage directories. */ private transient boolean isInitialized; /** * Max consumed memory size for one batch in {@link RocksDBWriteBatchWrapper}, default value * 2mb. 
*/ private long writeBatchSize; // ------------------------------------------------------------------------ // TODO: ------------ start:二次开发代码 --------------- // /** * State disk choose policy */ private static final String FLINK_STATE_DISK_CHOOSE_POLICY_ROUND_ROBIN = "ROUND_ROBIN"; /** * Default state disk choose policy */ private static final String FLINK_STATE_DISK_CHOOSE_POLICY_DEFAULT = "DEFAULT"; /** * distributed dir on each taskManager */ private DistributedAtomicInteger dirIndex; /** * state choose disk policy */ private String stateDiskPolicy; private transient CuratorFramework client; private String currentHostName; // 初始化标识,避免多次初始化 private boolean isInitZKClient = false; // 用于统计磁盘负载的zk地址 private final static String STATE_ZOOKEEPER_URL = "flink.state.external.zookeeper.url"; // 状态本地磁盘路径选取策略:default/round_robin private final static String STATE_CHOOSE_DISK_POLICY = "flink.state.choose.disk.policy"; /** * 初始化round_robin策略下的zookeeper连接 */ private void initZKClient() { synchronized (RocksDBStateBackend.class) { if (isInitZKClient) { return; } this.isInitZKClient = true; final String zkUrl = PropUtils.getString(STATE_ZOOKEEPER_URL, ""); this.stateDiskPolicy = PropUtils.getString(STATE_CHOOSE_DISK_POLICY, FLINK_STATE_DISK_CHOOSE_POLICY_DEFAULT).toUpperCase(); LOG.info("当前磁盘路径选择策略:" + this.stateDiskPolicy); // 如果zk地址不为空,并且开启了ROUND_ROBIN磁盘路径选择策略,则建立zookeeper的连接,避免太多任务建立太多的连接 if (StringUtils.isNotBlank(zkUrl) && this.isRoundRobin()) { try { LOG.info("开启基于zookeeper的本地磁盘状态路径选择策略"); this.client = CuratorFrameworkFactory.builder().connectString(zkUrl) .connectionTimeoutMs(5000).retryPolicy(new RetryOneTime(5000)).build(); this.client.start(); Runtime.getRuntime().addShutdownHook(new Thread(() -> { if (client != null) { client.close(); LOG.info("释放基于zookeeper的本地磁盘状态路径选择策略的连接"); } })); } catch (Exception e) { LOG.error("初始化CuratorFrameworkFactory失败", e); } } } } /** * 判断是否为ROUND_ROBIN模式 */ private boolean isRoundRobin() { if (!this.isInitZKClient) { this.initZKClient(); } return FLINK_STATE_DISK_CHOOSE_POLICY_ROUND_ROBIN.equalsIgnoreCase(this.stateDiskPolicy); } // TODO: ------------ end:二次开发代码 --------------- // /** * Creates a new {@code RocksDBStateBackend} that stores its checkpoint data in the file system * and location defined by the given URI. * *

A state backend that stores checkpoints in HDFS or S3 must specify the file system host * and port in the URI, or have the Hadoop configuration that describes the file system (host / * high-availability group / possibly credentials) either referenced from the Flink config, or * included in the classpath. * * @param checkpointDataUri The URI describing the filesystem and path to the checkpoint data * directory. * @throws IOException Thrown, if no file system can be found for the scheme in the URI. */ public RocksDBStateBackend(String checkpointDataUri) throws IOException { this(new Path(checkpointDataUri).toUri()); } /** * Creates a new {@code RocksDBStateBackend} that stores its checkpoint data in the file system * and location defined by the given URI. * *

A state backend that stores checkpoints in HDFS or S3 must specify the file system host * and port in the URI, or have the Hadoop configuration that describes the file system (host / * high-availability group / possibly credentials) either referenced from the Flink config, or * included in the classpath. * * @param checkpointDataUri The URI describing the filesystem and path to the checkpoint data * directory. * @param enableIncrementalCheckpointing True if incremental checkpointing is enabled. * @throws IOException Thrown, if no file system can be found for the scheme in the URI. */ public RocksDBStateBackend(String checkpointDataUri, boolean enableIncrementalCheckpointing) throws IOException { this(new Path(checkpointDataUri).toUri(), enableIncrementalCheckpointing); } /** * Creates a new {@code RocksDBStateBackend} that stores its checkpoint data in the file system * and location defined by the given URI. * *

A state backend that stores checkpoints in HDFS or S3 must specify the file system host * and port in the URI, or have the Hadoop configuration that describes the file system (host / * high-availability group / possibly credentials) either referenced from the Flink config, or * included in the classpath. * * @param checkpointDataUri The URI describing the filesystem and path to the checkpoint data * directory. * @throws IOException Thrown, if no file system can be found for the scheme in the URI. */ public RocksDBStateBackend(URI checkpointDataUri) throws IOException { this(new FsStateBackend(checkpointDataUri)); } /** * Creates a new {@code RocksDBStateBackend} that stores its checkpoint data in the file system * and location defined by the given URI. * *

A state backend that stores checkpoints in HDFS or S3 must specify the file system host * and port in the URI, or have the Hadoop configuration that describes the file system (host / * high-availability group / possibly credentials) either referenced from the Flink config, or * included in the classpath. * * @param checkpointDataUri The URI describing the filesystem and path to the checkpoint data * directory. * @param enableIncrementalCheckpointing True if incremental checkpointing is enabled. * @throws IOException Thrown, if no file system can be found for the scheme in the URI. */ public RocksDBStateBackend(URI checkpointDataUri, boolean enableIncrementalCheckpointing) throws IOException { this(new FsStateBackend(checkpointDataUri), enableIncrementalCheckpointing); } /** * Creates a new {@code RocksDBStateBackend} that uses the given state backend to store its * checkpoint data streams. Typically, one would supply a filesystem or database state backend * here where the snapshots from RocksDB would be stored. * *

The snapshots of the RocksDB state will be stored using the given backend's {@link * StateBackend#createCheckpointStorage(JobID)}. * * @param checkpointStreamBackend The backend write the checkpoint streams to. */ public RocksDBStateBackend(StateBackend checkpointStreamBackend) { this(checkpointStreamBackend, TernaryBoolean.UNDEFINED); } /** * Creates a new {@code RocksDBStateBackend} that uses the given state backend to store its * checkpoint data streams. Typically, one would supply a filesystem or database state backend * here where the snapshots from RocksDB would be stored. * *

The snapshots of the RocksDB state will be stored using the given backend's {@link * StateBackend#createCheckpointStorage(JobID)}. * * @param checkpointStreamBackend The backend write the checkpoint streams to. * @param enableIncrementalCheckpointing True if incremental checkpointing is enabled. */ public RocksDBStateBackend( StateBackend checkpointStreamBackend, TernaryBoolean enableIncrementalCheckpointing) { this.checkpointStreamBackend = checkNotNull(checkpointStreamBackend); this.enableIncrementalCheckpointing = enableIncrementalCheckpointing; this.numberOfTransferThreads = UNDEFINED_NUMBER_OF_TRANSFER_THREADS; this.defaultMetricOptions = new RocksDBNativeMetricOptions(); this.memoryConfiguration = new RocksDBMemoryConfiguration(); this.writeBatchSize = UNDEFINED_WRITE_BATCH_SIZE; // TODO: ------------ start:二次开发代码 --------------- // this.initZKClient(); // TODO: ------------ end:二次开发代码 --------------- // } /** * @deprecated Use {@link #RocksDBStateBackend(StateBackend)} instead. */ @Deprecated public RocksDBStateBackend(AbstractStateBackend checkpointStreamBackend) { this(checkpointStreamBackend, TernaryBoolean.UNDEFINED); } /** * @deprecated Use {@link #RocksDBStateBackend(StateBackend, TernaryBoolean)} instead. */ @Deprecated public RocksDBStateBackend( AbstractStateBackend checkpointStreamBackend, boolean enableIncrementalCheckpointing) { this(checkpointStreamBackend, TernaryBoolean.fromBoolean(enableIncrementalCheckpointing)); } /** * Private constructor that creates a re-configured copy of the state backend. * * @param original The state backend to re-configure. * @param config The configuration. * @param classLoader The class loader. */ private RocksDBStateBackend( RocksDBStateBackend original, ReadableConfig config, ClassLoader classLoader) { // reconfigure the state backend backing the streams final StateBackend originalStreamBackend = original.checkpointStreamBackend; this.checkpointStreamBackend = originalStreamBackend instanceof ConfigurableStateBackend ? 
((ConfigurableStateBackend) originalStreamBackend) .configure(config, classLoader) : originalStreamBackend; // configure incremental checkpoints this.enableIncrementalCheckpointing = original.enableIncrementalCheckpointing.resolveUndefined( config.get(CheckpointingOptions.INCREMENTAL_CHECKPOINTS)); if (original.numberOfTransferThreads == UNDEFINED_NUMBER_OF_TRANSFER_THREADS) { this.numberOfTransferThreads = config.get(CHECKPOINT_TRANSFER_THREAD_NUM); } else { this.numberOfTransferThreads = original.numberOfTransferThreads; } if (original.writeBatchSize == UNDEFINED_WRITE_BATCH_SIZE) { this.writeBatchSize = config.get(WRITE_BATCH_SIZE).getBytes(); } else { this.writeBatchSize = original.writeBatchSize; } this.memoryConfiguration = RocksDBMemoryConfiguration.fromOtherAndConfiguration( original.memoryConfiguration, config); this.memoryConfiguration.validate(); if (null == original.priorityQueueStateType) { this.priorityQueueStateType = config.get(TIMER_SERVICE_FACTORY); } else { this.priorityQueueStateType = original.priorityQueueStateType; } // configure local directories if (original.localRocksDbDirectories != null) { this.localRocksDbDirectories = original.localRocksDbDirectories; } else { final String rocksdbLocalPaths = config.get(RocksDBOptions.LOCAL_DIRECTORIES); if (rocksdbLocalPaths != null) { String[] directories = rocksdbLocalPaths.split(",|" + File.pathSeparator); try { setDbStoragePaths(directories); } catch (IllegalArgumentException e) { throw new IllegalConfigurationException( "Invalid configuration for RocksDB state " + "backend's local storage directories: " + e.getMessage(), e); } } } // configure metric options this.defaultMetricOptions = RocksDBNativeMetricOptions.fromConfig(config); // configure RocksDB predefined options this.predefinedOptions = original.predefinedOptions == null ? PredefinedOptions.valueOf(config.get(RocksDBOptions.PREDEFINED_OPTIONS)) : original.predefinedOptions; LOG.info("Using predefined options: {}.", predefinedOptions.name()); // configure RocksDB options factory try { rocksDbOptionsFactory = configureOptionsFactory( original.rocksDbOptionsFactory, config.get(RocksDBOptions.OPTIONS_FACTORY), config, classLoader); } catch (DynamicCodeLoadingException e) { throw new FlinkRuntimeException(e); } // TODO: ------------ start:二次开发代码 --------------- // this.initZKClient(); // TODO: ------------ end:二次开发代码 --------------- // } // ------------------------------------------------------------------------ // Reconfiguration // ------------------------------------------------------------------------ /** * Creates a copy of this state backend that uses the values defined in the configuration for * fields where that were not yet specified in this state backend. * * @param config The configuration. * @param classLoader The class loader. * @return The re-configured variant of the state backend */ @Override public RocksDBStateBackend configure(ReadableConfig config, ClassLoader classLoader) { return new RocksDBStateBackend(this, config, classLoader); } // ------------------------------------------------------------------------ // State backend methods // ------------------------------------------------------------------------ /** * Gets the state backend that this RocksDB state backend uses to persist its bytes to. * *
This RocksDB state backend only implements the RocksDB specific parts; it relies on the * 'CheckpointBackend' to persist the checkpoint and savepoint byte streams. */ public StateBackend getCheckpointBackend() { return checkpointStreamBackend; } private void lazyInitializeForJob( Environment env, @SuppressWarnings("unused") String operatorIdentifier) throws IOException { if (isInitialized) { return; } this.jobId = env.getJobID(); // initialize the paths where the local RocksDB files should be stored if (localRocksDbDirectories == null) { // initialize from the temp directories initializedDbBasePaths = env.getIOManager().getSpillingDirectories(); LOG.info("initializedDbBasePaths.size:" + initializedDbBasePaths.length); for (File file : initializedDbBasePaths) { LOG.info("initializedDbBasePaths:" + file.getPath()); } } else { List<File> dirs = new ArrayList<>(localRocksDbDirectories.length); StringBuilder errorMessage = new StringBuilder(); for (File f : localRocksDbDirectories) { File testDir = new File(f, UUID.randomUUID().toString()); if (!testDir.mkdirs()) { String msg = "Local DB files directory '" + f + "' does not exist and cannot be created. "; LOG.error(msg); errorMessage.append(msg); } else { dirs.add(f); } //noinspection ResultOfMethodCallIgnored testDir.delete(); } if (dirs.isEmpty()) { throw new IOException("No local storage directories available. " + errorMessage); } else { initializedDbBasePaths = dirs.toArray(new File[dirs.size()]); } } // TODO: ------------ start:二次开发代码 --------------- // if (isRoundRobin()) { this.currentHostName = env.getTaskManagerInfo().getConfiguration().getString(TaskManagerOptions.HOST); } // TODO: ------------ end:二次开发代码 --------------- // nextDirectory = new Random().nextInt(initializedDbBasePaths.length); isInitialized = true; } private File getNextStoragePath() { // TODO: ------------ start:二次开发代码 --------------- // // When the round-robin policy is enabled, the next local RocksDB directory is chosen via a per-host counter kept in ZooKeeper (a Curator DistributedAtomicInteger under /rocksDB/<hostname>), so slots on the same machine spread their state across the configured disks; any ZooKeeper error falls back to a random choice. int ni = nextDirectory; if (isRoundRobin()) { try { String counterPath = "/rocksDB/" + this.currentHostName; ExponentialBackoffRetry retryPolicy = new ExponentialBackoffRetry(1000, 10); this.dirIndex = new DistributedAtomicInteger(this.client, counterPath, retryPolicy); this.dirIndex.initialize(0); AtomicValue<Integer> value = this.dirIndex.increment(); if (value.succeeded()) { ni = value.postValue() % initializedDbBasePaths.length; } else { ni = new Random().nextInt(initializedDbBasePaths.length); } } catch (Exception e) { ni = new Random().nextInt(initializedDbBasePaths.length); LOG.error("ZooKeeper-based selection of the local state disk path failed; to fall back to Flink's default selection strategy, set the following parameter in commons.properties: flink.state.choose.disk.policy=default", e); } } else { ni = nextDirectory + 1; ni = ni >= initializedDbBasePaths.length ?
0 : ni; nextDirectory = ni; } LOG.info("Next state file storage path is: " + initializedDbBasePaths[ni].getPath()); // TODO: ------------ end:二次开发代码 --------------- // return initializedDbBasePaths[ni]; } // ------------------------------------------------------------------------ // Checkpoint initialization and persistent storage // ------------------------------------------------------------------------ @Override public CompletedCheckpointStorageLocation resolveCheckpoint(String pointer) throws IOException { return checkpointStreamBackend.resolveCheckpoint(pointer); } @Override public CheckpointStorageAccess createCheckpointStorage(JobID jobId) throws IOException { return checkpointStreamBackend.createCheckpointStorage(jobId); } // ------------------------------------------------------------------------ // State holding data structures // ------------------------------------------------------------------------ @Override public AbstractKeyedStateBackend createKeyedStateBackend( Environment env, JobID jobID, String operatorIdentifier, TypeSerializer keySerializer, int numberOfKeyGroups, KeyGroupRange keyGroupRange, TaskKvStateRegistry kvStateRegistry, TtlTimeProvider ttlTimeProvider, MetricGroup metricGroup, @Nonnull Collection stateHandles, CloseableRegistry cancelStreamRegistry) throws IOException { return createKeyedStateBackend( env, jobID, operatorIdentifier, keySerializer, numberOfKeyGroups, keyGroupRange, kvStateRegistry, ttlTimeProvider, metricGroup, stateHandles, cancelStreamRegistry, 1.0); } @Override public AbstractKeyedStateBackend createKeyedStateBackend( Environment env, JobID jobID, String operatorIdentifier, TypeSerializer keySerializer, int numberOfKeyGroups, KeyGroupRange keyGroupRange, TaskKvStateRegistry kvStateRegistry, TtlTimeProvider ttlTimeProvider, MetricGroup metricGroup, @Nonnull Collection stateHandles, CloseableRegistry cancelStreamRegistry, double managedMemoryFraction) throws IOException { // first, make sure that the RocksDB JNI library is loaded // we do this explicitly here to have better error handling String tempDir = env.getTaskManagerInfo().getTmpDirectories()[0]; ensureRocksDBIsLoaded(tempDir); // replace all characters that are not legal for filenames with underscore String fileCompatibleIdentifier = operatorIdentifier.replaceAll("[^a-zA-Z0-9\\-]", "_"); lazyInitializeForJob(env, fileCompatibleIdentifier); File instanceBasePath = new File( getNextStoragePath(), "job_" + jobId + "_op_" + fileCompatibleIdentifier + "_uuid_" + UUID.randomUUID()); LocalRecoveryConfig localRecoveryConfig = env.getTaskStateManager().createLocalRecoveryConfig(); final OpaqueMemoryResource sharedResources = RocksDBOperationUtils.allocateSharedCachesIfConfigured( memoryConfiguration, env.getMemoryManager(), managedMemoryFraction, LOG); if (sharedResources != null) { LOG.info("Obtained shared RocksDB cache of size {} bytes", sharedResources.getSize()); } final RocksDBResourceContainer resourceContainer = createOptionsAndResourceContainer(sharedResources); ExecutionConfig executionConfig = env.getExecutionConfig(); StreamCompressionDecorator keyGroupCompressionDecorator = getCompressionDecorator(executionConfig); RocksDBKeyedStateBackendBuilder builder = new RocksDBKeyedStateBackendBuilder<>( operatorIdentifier, env.getUserCodeClassLoader().asClassLoader(), instanceBasePath, resourceContainer, stateName -> resourceContainer.getColumnOptions(), kvStateRegistry, keySerializer, numberOfKeyGroups, keyGroupRange, executionConfig, localRecoveryConfig, getPriorityQueueStateType(), 
ttlTimeProvider, metricGroup, stateHandles, keyGroupCompressionDecorator, cancelStreamRegistry) .setEnableIncrementalCheckpointing(isIncrementalCheckpointsEnabled()) .setNumberOfTransferingThreads(getNumberOfTransferThreads()) .setNativeMetricOptions( resourceContainer.getMemoryWatcherOptions(defaultMetricOptions)) .setWriteBatchSize(getWriteBatchSize()); return builder.build(); } @Override public OperatorStateBackend createOperatorStateBackend( Environment env, String operatorIdentifier, @Nonnull Collection stateHandles, CloseableRegistry cancelStreamRegistry) throws Exception { // the default for RocksDB; eventually there can be a operator state backend based on // RocksDB, too. final boolean asyncSnapshots = true; return new DefaultOperatorStateBackendBuilder( env.getUserCodeClassLoader().asClassLoader(), env.getExecutionConfig(), asyncSnapshots, stateHandles, cancelStreamRegistry) .build(); } private RocksDBOptionsFactory configureOptionsFactory( @Nullable RocksDBOptionsFactory originalOptionsFactory, String factoryClassName, ReadableConfig config, ClassLoader classLoader) throws DynamicCodeLoadingException { if (originalOptionsFactory != null) { if (originalOptionsFactory instanceof ConfigurableRocksDBOptionsFactory) { originalOptionsFactory = ((ConfigurableRocksDBOptionsFactory) originalOptionsFactory) .configure(config); } LOG.info("Using application-defined options factory: {}.", originalOptionsFactory); return originalOptionsFactory; } // if using DefaultConfigurableOptionsFactory by default, we could avoid reflection to speed // up. if (factoryClassName.equalsIgnoreCase(DefaultConfigurableOptionsFactory.class.getName())) { DefaultConfigurableOptionsFactory optionsFactory = new DefaultConfigurableOptionsFactory(); optionsFactory.configure(config); LOG.info("Using default options factory: {}.", optionsFactory); return optionsFactory; } else { try { @SuppressWarnings("rawtypes") Class clazz = Class.forName(factoryClassName, false, classLoader) .asSubclass(RocksDBOptionsFactory.class); RocksDBOptionsFactory optionsFactory = clazz.newInstance(); if (optionsFactory instanceof ConfigurableRocksDBOptionsFactory) { optionsFactory = ((ConfigurableRocksDBOptionsFactory) optionsFactory).configure(config); } LOG.info("Using configured options factory: {}.", optionsFactory); return optionsFactory; } catch (ClassNotFoundException e) { throw new DynamicCodeLoadingException( "Cannot find configured options factory class: " + factoryClassName, e); } catch (ClassCastException | InstantiationException | IllegalAccessException e) { throw new DynamicCodeLoadingException( "The class configured under '" + RocksDBOptions.OPTIONS_FACTORY.key() + "' is not a valid options factory (" + factoryClassName + ')', e); } } } // ------------------------------------------------------------------------ // Parameters // ------------------------------------------------------------------------ /** * Gets the memory configuration object, which offers settings to control RocksDB's memory * usage. */ public RocksDBMemoryConfiguration getMemoryConfiguration() { return memoryConfiguration; } /** * Sets the path where the RocksDB local database files should be stored on the local file * system. Setting this path overrides the default behavior, where the files are stored across * the configured temp directories. * *
Passing {@code null} to this function restores the default behavior, where the configured * temp directories will be used. * * @param path The path where the local RocksDB database files are stored. */ public void setDbStoragePath(String path) { setDbStoragePaths(path == null ? null : new String[]{path}); } /** * Sets the directories in which the local RocksDB database puts its files (like SST and * metadata files). These directories do not need to be persistent, they can be ephemeral, * meaning that they are lost on a machine failure, because state in RocksDB is persisted in * checkpoints. * *
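* For illustration (the variable name {@code rocksDbBackend} and the paths are placeholders chosen only for this example, not values shipped with this file), a job could spread the local RocksDB working directories across two disks: {@code rocksDbBackend.setDbStoragePaths("/data1/flink/rocksdb", "/data2/flink/rocksdb");}. * *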
If nothing is configured, these directories default to the TaskManager's local temporary * file directories. * *
Each distinct state will be stored in one path, but when the state backend creates * multiple states, they will store their files on different paths. * *
Passing {@code null} to this function restores the default behavior, where the configured * temp directories will be used. * * @param paths The paths across which the local RocksDB database files will be spread. */ public void setDbStoragePaths(String... paths) { if (paths == null) { localRocksDbDirectories = null; } else if (paths.length == 0) { throw new IllegalArgumentException("empty paths"); } else { File[] pp = new File[paths.length]; for (int i = 0; i < paths.length; i++) { final String rawPath = paths[i]; final String path; if (rawPath == null) { throw new IllegalArgumentException("null path"); } else { // we need this for backwards compatibility, to allow URIs like 'file:///'... URI uri = null; try { uri = new Path(rawPath).toUri(); } catch (Exception e) { // cannot parse as a path } if (uri != null && uri.getScheme() != null) { if ("file".equalsIgnoreCase(uri.getScheme())) { path = uri.getPath(); } else { throw new IllegalArgumentException( "Path " + rawPath + " has a non-local scheme"); } } else { path = rawPath; } } pp[i] = new File(path); if (!pp[i].isAbsolute()) { throw new IllegalArgumentException("Relative paths are not supported"); } } localRocksDbDirectories = pp; } } /** * Gets the configured local DB storage paths, or null, if none were configured. * *
Under these directories on the TaskManager, RocksDB stores its SST files and metadata * files. These directories do not need to be persistent; they can be ephemeral, meaning that * they are lost on a machine failure, because state in RocksDB is persisted in checkpoints. * *
If nothing is configured, these directories default to the TaskManager's local temporary * file directories. */ public String[] getDbStoragePaths() { if (localRocksDbDirectories == null) { return null; } else { String[] paths = new String[localRocksDbDirectories.length]; for (int i = 0; i < paths.length; i++) { paths[i] = localRocksDbDirectories[i].toString(); } return paths; } } /** * Gets whether incremental checkpoints are enabled for this state backend. */ public boolean isIncrementalCheckpointsEnabled() { return enableIncrementalCheckpointing.getOrDefault( CheckpointingOptions.INCREMENTAL_CHECKPOINTS.defaultValue()); } /** * Gets the type of the priority queue state. It will fallback to the default value, if it is * not explicitly set. * * @return The type of the priority queue state. */ public PriorityQueueStateType getPriorityQueueStateType() { return priorityQueueStateType == null ? TIMER_SERVICE_FACTORY.defaultValue() : priorityQueueStateType; } /** * Sets the type of the priority queue state. It will fallback to the default value, if it is * not explicitly set. */ public void setPriorityQueueStateType(PriorityQueueStateType priorityQueueStateType) { this.priorityQueueStateType = checkNotNull(priorityQueueStateType); } // ------------------------------------------------------------------------ // Parametrize with RocksDB Options // ------------------------------------------------------------------------ /** * Sets the predefined options for RocksDB. * *
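* For example (an illustrative choice, not a recommendation made by this file), a job whose state lives on spinning disks with ample memory might pick one of the shipped profiles: {@code rocksDbBackend.setPredefinedOptions(PredefinedOptions.SPINNING_DISK_OPTIMIZED_HIGH_MEM);}, where {@code rocksDbBackend} names the backend instance being configured. * *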
If user-configured options within {@link RocksDBConfigurableOptions} are set (through * flink-conf.yaml) or a user-defined options factory is set (via {@link * #setRocksDBOptions(RocksDBOptionsFactory)}), then the options from the factory are applied on * top of the predefined and customized options specified here. * * @param options The options to set (must not be null). */ public void setPredefinedOptions(@Nonnull PredefinedOptions options) { predefinedOptions = checkNotNull(options); } /** * Gets the currently set predefined options for RocksDB. The default options (if nothing was * set via {@link #setPredefinedOptions(PredefinedOptions)}) are {@link * PredefinedOptions#DEFAULT}. * *
If user-configured options within {@link RocksDBConfigurableOptions} are set (through * flink-conf.yaml) or a user-defined options factory is set (via {@link * #setRocksDBOptions(RocksDBOptionsFactory)}), then the options from the factory are applied on * top of the predefined and customized options. * * @return The currently set predefined options for RocksDB. */ @VisibleForTesting public PredefinedOptions getPredefinedOptions() { if (predefinedOptions == null) { predefinedOptions = PredefinedOptions.DEFAULT; } return predefinedOptions; } /** * Sets {@link org.rocksdb.Options} for the RocksDB instances. Because the options are not * serializable and hold native code references, they must be specified through a factory. * *
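* A minimal sketch of such a factory (illustrative only; {@code rocksDbBackend} and the chosen option values are placeholders): {@code rocksDbBackend.setRocksDBOptions(new RocksDBOptionsFactory() { public DBOptions createDBOptions(DBOptions currentOptions, Collection<AutoCloseable> handlesToClose) { return currentOptions.setMaxOpenFiles(1024); } public ColumnFamilyOptions createColumnOptions(ColumnFamilyOptions currentOptions, Collection<AutoCloseable> handlesToClose) { return currentOptions.setCompactionStyle(CompactionStyle.LEVEL); } });} Because the resulting options hold native handles, they are created lazily through the factory on the task managers rather than shipped as ready-made objects. * *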
The options created by the factory here are applied on top of the pre-defined options * profile selected via {@link #setPredefinedOptions(PredefinedOptions)}. If the pre-defined * options profile is the default ({@link PredefinedOptions#DEFAULT}), then the factory fully * controls the RocksDB options. * * @param optionsFactory The options factory that lazily creates the RocksDB options. */ public void setRocksDBOptions(RocksDBOptionsFactory optionsFactory) { this.rocksDbOptionsFactory = optionsFactory; } /** * Gets {@link org.rocksdb.Options} for the RocksDB instances. * *
The options created by the factory here are applied on top of the pre-defined options * profile selected via {@link #setPredefinedOptions(PredefinedOptions)}. If the pre-defined * options profile is the default ({@link PredefinedOptions#DEFAULT}), then the factory fully * controls the RocksDB options. */ @Nullable public RocksDBOptionsFactory getRocksDBOptions() { return rocksDbOptionsFactory; } /** * Gets the number of threads used to transfer files while snapshotting/restoring. */ public int getNumberOfTransferThreads() { return numberOfTransferThreads == UNDEFINED_NUMBER_OF_TRANSFER_THREADS ? CHECKPOINT_TRANSFER_THREAD_NUM.defaultValue() : numberOfTransferThreads; } /** * Sets the number of threads used to transfer files while snapshotting/restoring. * * @param numberOfTransferThreads The number of threads used to transfer files while * snapshotting/restoring. */ public void setNumberOfTransferThreads(int numberOfTransferThreads) { Preconditions.checkArgument( numberOfTransferThreads > 0, "The number of threads used to transfer files in RocksDBStateBackend should be greater than zero."); this.numberOfTransferThreads = numberOfTransferThreads; } /** * @deprecated Typo in method name. Use {@link #getNumberOfTransferThreads} instead. */ @Deprecated public int getNumberOfTransferingThreads() { return getNumberOfTransferThreads(); } /** * @deprecated Typo in method name. Use {@link #setNumberOfTransferThreads(int)} instead. */ @Deprecated public void setNumberOfTransferingThreads(int numberOfTransferingThreads) { setNumberOfTransferThreads(numberOfTransferingThreads); } /** * Gets the max batch size that will be used in {@link RocksDBWriteBatchWrapper}. */ public long getWriteBatchSize() { return writeBatchSize == UNDEFINED_WRITE_BATCH_SIZE ? WRITE_BATCH_SIZE.defaultValue().getBytes() : writeBatchSize; } /** * Sets the max batch size that will be used in {@link RocksDBWriteBatchWrapper}; a value of 0 * disables the memory size controller, leaving only the item count controller in effect. * * @param writeBatchSize The size that will be used in {@link RocksDBWriteBatchWrapper}. */ public void setWriteBatchSize(long writeBatchSize) { checkArgument(writeBatchSize >= 0, "Write batch size has to be non-negative."); this.writeBatchSize = writeBatchSize; } // ------------------------------------------------------------------------ // utilities // ------------------------------------------------------------------------ @VisibleForTesting RocksDBResourceContainer createOptionsAndResourceContainer() { return createOptionsAndResourceContainer(null); } @VisibleForTesting private RocksDBResourceContainer createOptionsAndResourceContainer( @Nullable OpaqueMemoryResource sharedResources) { return new RocksDBResourceContainer( predefinedOptions != null ?
predefinedOptions : PredefinedOptions.DEFAULT, rocksDbOptionsFactory, sharedResources); } @Override public String toString() { return "RocksDBStateBackend{" + "checkpointStreamBackend=" + checkpointStreamBackend + ", localRocksDbDirectories=" + Arrays.toString(localRocksDbDirectories) + ", enableIncrementalCheckpointing=" + enableIncrementalCheckpointing + ", numberOfTransferThreads=" + numberOfTransferThreads + ", writeBatchSize=" + writeBatchSize + '}'; } // ------------------------------------------------------------------------ // static library loading utilities // ------------------------------------------------------------------------ @VisibleForTesting static void ensureRocksDBIsLoaded(String tempDirectory) throws IOException { synchronized (RocksDBStateBackend.class) { if (!rocksDbInitialized) { final File tempDirParent = new File(tempDirectory).getAbsoluteFile(); LOG.info( "Attempting to load RocksDB native library and store it under '{}'", tempDirParent); Throwable lastException = null; for (int attempt = 1; attempt <= ROCKSDB_LIB_LOADING_ATTEMPTS; attempt++) { File rocksLibFolder = null; try { // when multiple instances of this class and RocksDB exist in different // class loaders, then we can see the following exception: // "java.lang.UnsatisfiedLinkError: Native Library // /path/to/temp/dir/librocksdbjni-linux64.so // already loaded in another class loader" // to avoid that, we need to add a random element to the library file path // (I know, seems like an unnecessary hack, since the JVM obviously can // handle multiple // instances of the same JNI library being loaded in different class // loaders, but // apparently not when coming from the same file path, so there we go) rocksLibFolder = new File(tempDirParent, "rocksdb-lib-" + new AbstractID()); // make sure the temp path exists LOG.debug( "Attempting to create RocksDB native library folder {}", rocksLibFolder); // noinspection ResultOfMethodCallIgnored rocksLibFolder.mkdirs(); // explicitly load the JNI dependency if it has not been loaded before NativeLibraryLoader.getInstance() .loadLibrary(rocksLibFolder.getAbsolutePath()); // this initialization here should validate that the loading succeeded RocksDB.loadLibrary(); // seems to have worked LOG.info("Successfully loaded RocksDB native library"); rocksDbInitialized = true; return; } catch (Throwable t) { lastException = t; LOG.debug("RocksDB JNI library loading attempt {} failed", attempt, t); // try to force RocksDB to attempt reloading the library try { resetRocksDBLoadedFlag(); } catch (Throwable tt) { LOG.debug( "Failed to reset 'initialized' flag in RocksDB native code loader", tt); } FileUtils.deleteDirectoryQuietly(rocksLibFolder); } } throw new IOException("Could not load the native RocksDB library", lastException); } } } @VisibleForTesting static void resetRocksDBLoadedFlag() throws Exception { final Field initField = NativeLibraryLoader.class.getDeclaredField("initialized"); initField.setAccessible(true); initField.setBoolean(null, false); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.12/org/apache/flink/contrib/streaming/state/restore/RocksDBFullRestoreOperation.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. 
The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.contrib.streaming.state.restore; import org.apache.flink.api.common.typeutils.base.array.BytePrimitiveArraySerializer; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackend.RocksDbKvStateInfo; import org.apache.flink.contrib.streaming.state.RocksDBNativeMetricOptions; import org.apache.flink.contrib.streaming.state.RocksDBWriteBatchWrapper; import org.apache.flink.contrib.streaming.state.ttl.RocksDbTtlCompactFiltersManager; import org.apache.flink.core.fs.CloseableRegistry; import org.apache.flink.core.fs.FSDataInputStream; import org.apache.flink.core.memory.DataInputView; import org.apache.flink.core.memory.DataInputViewStreamWrapper; import org.apache.flink.metrics.MetricGroup; import org.apache.flink.runtime.state.*; import org.apache.flink.runtime.state.metainfo.StateMetaInfoSnapshot; import org.apache.flink.runtime.state.metainfo.StateMetaInfoSnapshot.BackendStateType; import org.apache.flink.util.IOUtils; import org.apache.flink.util.Preconditions; import org.apache.flink.util.StateMigrationException; import org.rocksdb.ColumnFamilyHandle; import org.rocksdb.ColumnFamilyOptions; import org.rocksdb.DBOptions; import org.rocksdb.RocksDBException; import javax.annotation.Nonnegative; import javax.annotation.Nonnull; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.lang.reflect.Field; import java.math.BigDecimal; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Map; import java.util.function.Function; import static org.apache.flink.contrib.streaming.state.snapshot.RocksSnapshotUtil.END_OF_KEY_GROUP_MARK; import static org.apache.flink.contrib.streaming.state.snapshot.RocksSnapshotUtil.clearMetaDataFollowsFlag; import static org.apache.flink.contrib.streaming.state.snapshot.RocksSnapshotUtil.hasMetaDataFollowsFlag; import static org.apache.flink.runtime.state.StateUtil.unexpectedStateHandleException; import static org.apache.flink.util.Preconditions.checkArgument; /** Encapsulates the process of restoring a RocksDB instance from a full snapshot. */ public class RocksDBFullRestoreOperation extends AbstractRocksDBRestoreOperation { /** Current key-groups state handle from which we restore key-groups. */ private KeyGroupsStateHandle currentKeyGroupsStateHandle; /** Current input stream we obtained from currentKeyGroupsStateHandle. */ private FSDataInputStream currentStateHandleInStream; /** Current data input view that wraps currentStateHandleInStream. */ private DataInputView currentStateHandleInView; /** * Current list of ColumnFamilyHandles for all column families we restore from * currentKeyGroupsStateHandle. */ private List currentStateHandleKVStateColumnFamilies; /** * The compression decorator that was used for writing the state, as determined by the meta * data. 
*/ private StreamCompressionDecorator keygroupStreamCompressionDecorator; /** Write batch size used in {@link RocksDBWriteBatchWrapper}. */ private final long writeBatchSize; private final PriorityQueueFlag queueRestoreEnabled; public RocksDBFullRestoreOperation( KeyGroupRange keyGroupRange, int keyGroupPrefixBytes, int numberOfTransferringThreads, CloseableRegistry cancelStreamRegistry, ClassLoader userCodeClassLoader, Map kvStateInformation, StateSerializerProvider keySerializerProvider, File instanceBasePath, File instanceRocksDBPath, DBOptions dbOptions, Function columnFamilyOptionsFactory, RocksDBNativeMetricOptions nativeMetricOptions, MetricGroup metricGroup, @Nonnull Collection restoreStateHandles, @Nonnull RocksDbTtlCompactFiltersManager ttlCompactFiltersManager, @Nonnegative long writeBatchSize, Long writeBufferManagerCapacity, PriorityQueueFlag queueRestoreEnabled) { super( keyGroupRange, keyGroupPrefixBytes, numberOfTransferringThreads, cancelStreamRegistry, userCodeClassLoader, kvStateInformation, keySerializerProvider, instanceBasePath, instanceRocksDBPath, dbOptions, columnFamilyOptionsFactory, nativeMetricOptions, metricGroup, restoreStateHandles, ttlCompactFiltersManager, writeBufferManagerCapacity); checkArgument(writeBatchSize >= 0, "Write batch size have to be no negative."); this.writeBatchSize = writeBatchSize; this.queueRestoreEnabled = queueRestoreEnabled; } /** Restores all key-groups data that is referenced by the passed state handles. */ @Override public RocksDBRestoreResult restore() throws IOException, StateMigrationException, RocksDBException { openDB(); for (KeyedStateHandle keyedStateHandle : restoreStateHandles) { if (keyedStateHandle != null) { if (!(keyedStateHandle instanceof KeyGroupsStateHandle)) { throw unexpectedStateHandleException( KeyGroupsStateHandle.class, keyedStateHandle.getClass()); } this.currentKeyGroupsStateHandle = (KeyGroupsStateHandle) keyedStateHandle; restoreKeyGroupsInStateHandle(); } } return new RocksDBRestoreResult( this.db, defaultColumnFamilyHandle, nativeMetricMonitor, -1, null, null); } /** Restore one key groups state handle. */ private void restoreKeyGroupsInStateHandle() throws IOException, StateMigrationException, RocksDBException { try { logger.info("Starting to restore from state handle: {}.", currentKeyGroupsStateHandle); // TODO: ------------ start:二次开发代码 --------------- // long startRestore = System.currentTimeMillis(); // TODO: ------------ end:二次开发代码 --------------- // currentStateHandleInStream = currentKeyGroupsStateHandle.openInputStream(); cancelStreamRegistry.registerCloseable(currentStateHandleInStream); currentStateHandleInView = new DataInputViewStreamWrapper(currentStateHandleInStream); restoreKVStateMetaData(); restoreKVStateData(); // TODO: ------------ start:二次开发代码 --------------- // logger.info("Finished restoring from state handle: {}, Elapsed:{}ms.", currentKeyGroupsStateHandle, System.currentTimeMillis() - startRestore); // TODO: ------------ end:二次开发代码 --------------- // } finally { if (cancelStreamRegistry.unregisterCloseable(currentStateHandleInStream)) { IOUtils.closeQuietly(currentStateHandleInStream); } } } /** * Restore the KV-state / ColumnFamily meta data for all key-groups referenced by the current * state handle. 
*/ private void restoreKVStateMetaData() throws IOException, StateMigrationException { // TODO: ------------ start:二次开发代码 --------------- // logger.info("Starting to restore KV state metadata."); long startRestoreMetaData = System.currentTimeMillis(); // TODO: ------------ end:二次开发代码 --------------- // KeyedBackendSerializationProxy serializationProxy = readMetaData(currentStateHandleInView); this.keygroupStreamCompressionDecorator = serializationProxy.isUsingKeyGroupCompression() ? SnappyStreamCompressionDecorator.INSTANCE : UncompressedStreamCompressionDecorator.INSTANCE; List restoredMetaInfos = serializationProxy.getStateMetaInfoSnapshots(); currentStateHandleKVStateColumnFamilies = new ArrayList<>(restoredMetaInfos.size()); // TODO: ------------ start:二次开发代码 --------------- // double index = 0; // TODO: ------------ end:二次开发代码 --------------- // for (StateMetaInfoSnapshot restoredMetaInfo : restoredMetaInfos) { if (restoredMetaInfo.getBackendStateType() == BackendStateType.PRIORITY_QUEUE && queueRestoreEnabled == PriorityQueueFlag.THROW_ON_PRIORITY_QUEUE) { throw new StateMigrationException( "Can not restore savepoint taken with RocksDB timers enabled with Heap timers!"); } // TODO: ------------ start:二次开发代码 --------------- // index += 1; long start = System.currentTimeMillis(); // TODO: ------------ end:二次开发代码 --------------- // RocksDbKvStateInfo registeredStateCFHandle = getOrRegisterStateColumnFamilyHandle(null, restoredMetaInfo); currentStateHandleKVStateColumnFamilies.add(registeredStateCFHandle.columnFamilyHandle); // TODO: ------------ start:二次开发代码 --------------- // BigDecimal progress = new BigDecimal(((index / restoredMetaInfos.size()) * 100)).setScale(2, BigDecimal.ROUND_HALF_UP); logger.info("Restore KV state metadata progress: {}/{}({}%), Elapsed:{}ms.", (int) index, restoredMetaInfos.size(), progress, System.currentTimeMillis() - start); // TODO: ------------ end:二次开发代码 --------------- // } // TODO: ------------ start:二次开发代码 --------------- // logger.info("Finished restoring KV state metadata, total:{}, Elapsed:{}ms.", (int) index, System.currentTimeMillis() - startRestoreMetaData); // TODO: ------------ end:二次开发代码 --------------- // } /** * Restore the KV-state / ColumnFamily data for all key-groups referenced by the current state * handle. */ private void restoreKVStateData() throws IOException, RocksDBException { // TODO: ------------ start:二次开发代码 --------------- // logger.info("Starting to restore KV state data."); long startRestoreData = System.currentTimeMillis(); double index = 0; // TODO: ------------ end:二次开发代码 --------------- // // for all key-groups in the current state handle... 
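// The custom block at the top of the try-with-resources below obtains the total number of key-group offsets by reading the private KeyGroupRangeOffsets#offsets array via reflection (the class exposes no size accessor); the value only feeds the percentage in the progress log lines, and if reflection fails the fallback of 100 merely skews the logged percentage without affecting the restore itself.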
try (RocksDBWriteBatchWrapper writeBatchWrapper = new RocksDBWriteBatchWrapper(db, writeBatchSize)) { // TODO: ------------ start:二次开发代码 --------------- // long listSize = 100; try { // Obtain the total number of key-group offsets via reflection, used only for logging the overall restore progress KeyGroupRangeOffsets keyGroupRangeOffsets = currentKeyGroupsStateHandle.getGroupRangeOffsets(); Field offsetsField = keyGroupRangeOffsets.getClass().getDeclaredField("offsets"); offsetsField.setAccessible(true); long[] offsets = (long[]) offsetsField.get(keyGroupRangeOffsets); if (offsets != null && offsets.length != 0) listSize = offsets.length; } catch (Exception e) { logger.warn("Failed to obtain the total count for progress logging; this has no impact on the job", e); } // TODO: ------------ end:二次开发代码 --------------- // for (Tuple2<Integer, Long> keyGroupOffset : currentKeyGroupsStateHandle.getGroupRangeOffsets()) { int keyGroup = keyGroupOffset.f0; index += 1; long start = System.currentTimeMillis(); // Check that restored key groups all belong to the backend Preconditions.checkState( keyGroupRange.contains(keyGroup), "The key group must belong to the backend"); long offset = keyGroupOffset.f1; // not empty key-group? if (0L != offset) { currentStateHandleInStream.seek(offset); try (InputStream compressedKgIn = keygroupStreamCompressionDecorator.decorateWithCompression( currentStateHandleInStream)) { DataInputViewStreamWrapper compressedKgInputView = new DataInputViewStreamWrapper(compressedKgIn); // TODO this could be aware of keyGroupPrefixBytes and write only one byte // if possible int kvStateId = compressedKgInputView.readShort(); ColumnFamilyHandle handle = currentStateHandleKVStateColumnFamilies.get(kvStateId); // insert all k/v pairs into DB boolean keyGroupHasMoreKeys = true; while (keyGroupHasMoreKeys) { byte[] key = BytePrimitiveArraySerializer.INSTANCE.deserialize( compressedKgInputView); byte[] value = BytePrimitiveArraySerializer.INSTANCE.deserialize( compressedKgInputView); if (hasMetaDataFollowsFlag(key)) { // clear the signal bit in the key to make it ready for insertion // again clearMetaDataFollowsFlag(key); writeBatchWrapper.put(handle, key, value); // TODO this could be aware of keyGroupPrefixBytes and write only // one byte if possible kvStateId = END_OF_KEY_GROUP_MARK & compressedKgInputView.readShort(); if (END_OF_KEY_GROUP_MARK == kvStateId) { keyGroupHasMoreKeys = false; } else { handle = currentStateHandleKVStateColumnFamilies.get(kvStateId); } } else { writeBatchWrapper.put(handle, key, value); } } } } BigDecimal progress = new BigDecimal((index / listSize) * 100).setScale(2, BigDecimal.ROUND_HALF_UP); // TODO: ------------ start:二次开发代码 --------------- // logger.info("Restore KV state data progress: {}/{}({}%), Elapsed:{}ms.", (int) index, listSize, progress, System.currentTimeMillis() - start); // TODO: ------------ end:二次开发代码 --------------- // } } // TODO: ------------ start:二次开发代码 --------------- // logger.info("Finished restoring KV state data, total:{}, Elapsed:{}ms.", (int) index, System.currentTimeMillis() - startRestoreData); // TODO: ------------ end:二次开发代码 --------------- // } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.12/org/apache/flink/runtime/checkpoint/CheckpointCoordinator.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership.
The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.runtime.checkpoint; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.JobID; import org.apache.flink.api.common.JobStatus; import org.apache.flink.runtime.checkpoint.CheckpointType.PostCheckpointAction; import org.apache.flink.runtime.checkpoint.hooks.MasterHooks; import org.apache.flink.runtime.concurrent.FutureUtils; import org.apache.flink.runtime.concurrent.ScheduledExecutor; import org.apache.flink.runtime.execution.ExecutionState; import org.apache.flink.runtime.executiongraph.*; import org.apache.flink.runtime.jobgraph.JobVertexID; import org.apache.flink.runtime.jobgraph.OperatorID; import org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration; import org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint; import org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint; import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; import org.apache.flink.runtime.operators.coordination.OperatorInfo; import org.apache.flink.runtime.state.*; import org.apache.flink.runtime.state.memory.ByteStreamStateHandle; import org.apache.flink.util.ExceptionUtils; import org.apache.flink.util.FlinkRuntimeException; import org.apache.flink.util.Preconditions; import org.apache.flink.util.StringUtils; import org.apache.flink.util.clock.Clock; import org.apache.flink.util.clock.SystemClock; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import javax.annotation.concurrent.GuardedBy; import java.io.IOException; import java.util.*; import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Predicate; import static org.apache.flink.util.ExceptionUtils.findThrowable; import static org.apache.flink.util.Preconditions.checkArgument; import static org.apache.flink.util.Preconditions.checkNotNull; /** * The checkpoint coordinator coordinates the distributed snapshots of operators and state. It * triggers the checkpoint by sending the messages to the relevant tasks and collects the checkpoint * acknowledgements. It also collects and maintains the overview of the state handles reported by * the tasks that acknowledge the checkpoint. */ public class CheckpointCoordinator { private static final Logger LOG = LoggerFactory.getLogger(CheckpointCoordinator.class); /** The number of recent checkpoints whose IDs are remembered. */ private static final int NUM_GHOST_CHECKPOINT_IDS = 16; // ------------------------------------------------------------------------ /** Coordinator-wide lock to safeguard the checkpoint updates. */ private final Object lock = new Object(); /** The job whose checkpoint this coordinator coordinates. */ private final JobID job; /** Default checkpoint properties. * */ private final CheckpointProperties checkpointProperties; /** The executor used for asynchronous calls, like potentially blocking I/O. 
*/ private final Executor executor; private final CheckpointsCleaner checkpointsCleaner; /** Tasks who need to be sent a message when a checkpoint is started. */ private final ExecutionVertex[] tasksToTrigger; /** Tasks who need to acknowledge a checkpoint before it succeeds. */ private final ExecutionVertex[] tasksToWaitFor; /** Tasks who need to be sent a message when a checkpoint is confirmed. */ // TODO currently we use commit vertices to receive "abort checkpoint" messages. private final ExecutionVertex[] tasksToCommitTo; /** The operator coordinators that need to be checkpointed. */ private final Collection coordinatorsToCheckpoint; /** Map from checkpoint ID to the pending checkpoint. */ @GuardedBy("lock") private final Map pendingCheckpoints; /** * Completed checkpoints. Implementations can be blocking. Make sure calls to methods accessing * this don't block the job manager actor and run asynchronously. */ private final CompletedCheckpointStore completedCheckpointStore; /** * The root checkpoint state backend, which is responsible for initializing the checkpoint, * storing the metadata, and cleaning up the checkpoint. */ private final CheckpointStorageCoordinatorView checkpointStorage; /** A list of recent checkpoint IDs, to identify late messages (vs invalid ones). */ private final ArrayDeque recentPendingCheckpoints; /** * Checkpoint ID counter to ensure ascending IDs. In case of job manager failures, these need to * be ascending across job managers. */ private final CheckpointIDCounter checkpointIdCounter; // TODO: ------------ start:二次开发代码 --------------- // /** * The base checkpoint interval. Actual trigger time may be affected by the max concurrent * checkpoints and minimum-pause values */ private long baseInterval; /** The max time (in ms) that a checkpoint may take. */ private long checkpointTimeout; /** * The min time(in ms) to delay after a checkpoint could be triggered. Allows to enforce minimum * processing time between checkpoint attempts */ private long minPauseBetweenCheckpoints; public long getBaseInterval() { return baseInterval; } public void setBaseInterval(long baseInterval) { this.baseInterval = baseInterval; } public void setCheckpointTimeout(long checkpointTimeout) { this.checkpointTimeout = checkpointTimeout; } public long getMinPauseBetweenCheckpoints() { return minPauseBetweenCheckpoints; } public void setMinPauseBetweenCheckpoints(long minPauseBetweenCheckpoints) { this.minPauseBetweenCheckpoints = minPauseBetweenCheckpoints; } private static CheckpointCoordinator coordinator; public static CheckpointCoordinator getInstance() { return CheckpointCoordinator.coordinator; } // TODO: ------------ end:二次开发代码 ----------------- // /** * The timer that handles the checkpoint timeouts and triggers periodic checkpoints. It must be * single-threaded. Eventually it will be replaced by main thread executor. */ private final ScheduledExecutor timer; /** The master checkpoint hooks executed by this checkpoint coordinator. */ private final HashMap> masterHooks; private final boolean unalignedCheckpointsEnabled; private final long alignmentTimeout; /** Actor that receives status updates from the execution graph this coordinator works for. */ private JobStatusListener jobStatusListener; /** The number of consecutive failed trigger attempts. */ private final AtomicInteger numUnsuccessfulCheckpointsTriggers = new AtomicInteger(0); /** A handle to the current periodic trigger, to cancel it when necessary. 
*/ private ScheduledFuture currentPeriodicTrigger; /** * The timestamp (via {@link Clock#relativeTimeMillis()}) when the last checkpoint completed. */ private long lastCheckpointCompletionRelativeTime; /** * Flag whether a triggered checkpoint should immediately schedule the next checkpoint. * Non-volatile, because only accessed in synchronized scope */ private boolean periodicScheduling; /** Flag marking the coordinator as shut down (not accepting any messages any more). */ private volatile boolean shutdown; /** Optional tracker for checkpoint statistics. */ @Nullable private CheckpointStatsTracker statsTracker; /** A factory for SharedStateRegistry objects. */ private final SharedStateRegistryFactory sharedStateRegistryFactory; /** Registry that tracks state which is shared across (incremental) checkpoints. */ private SharedStateRegistry sharedStateRegistry; private boolean isPreferCheckpointForRecovery; private final CheckpointFailureManager failureManager; private final Clock clock; private final boolean isExactlyOnceMode; /** Flag represents there is an in-flight trigger request. */ private boolean isTriggering = false; private final CheckpointRequestDecider requestDecider; // -------------------------------------------------------------------------------------------- public CheckpointCoordinator( JobID job, CheckpointCoordinatorConfiguration chkConfig, ExecutionVertex[] tasksToTrigger, ExecutionVertex[] tasksToWaitFor, ExecutionVertex[] tasksToCommitTo, Collection coordinatorsToCheckpoint, CheckpointIDCounter checkpointIDCounter, CompletedCheckpointStore completedCheckpointStore, StateBackend checkpointStateBackend, Executor executor, CheckpointsCleaner checkpointsCleaner, ScheduledExecutor timer, SharedStateRegistryFactory sharedStateRegistryFactory, CheckpointFailureManager failureManager) { this( job, chkConfig, tasksToTrigger, tasksToWaitFor, tasksToCommitTo, coordinatorsToCheckpoint, checkpointIDCounter, completedCheckpointStore, checkpointStateBackend, executor, checkpointsCleaner, timer, sharedStateRegistryFactory, failureManager, SystemClock.getInstance()); } @VisibleForTesting public CheckpointCoordinator( JobID job, CheckpointCoordinatorConfiguration chkConfig, ExecutionVertex[] tasksToTrigger, ExecutionVertex[] tasksToWaitFor, ExecutionVertex[] tasksToCommitTo, Collection coordinatorsToCheckpoint, CheckpointIDCounter checkpointIDCounter, CompletedCheckpointStore completedCheckpointStore, StateBackend checkpointStateBackend, Executor executor, CheckpointsCleaner checkpointsCleaner, ScheduledExecutor timer, SharedStateRegistryFactory sharedStateRegistryFactory, CheckpointFailureManager failureManager, Clock clock) { // sanity checks checkNotNull(checkpointStateBackend); // max "in between duration" can be one year - this is to prevent numeric overflows long minPauseBetweenCheckpoints = chkConfig.getMinPauseBetweenCheckpoints(); if (minPauseBetweenCheckpoints > 365L * 24 * 60 * 60 * 1_000) { minPauseBetweenCheckpoints = 365L * 24 * 60 * 60 * 1_000; } // it does not make sense to schedule checkpoints more often then the desired // time between checkpoints long baseInterval = chkConfig.getCheckpointInterval(); if (baseInterval < minPauseBetweenCheckpoints) { baseInterval = minPauseBetweenCheckpoints; } this.job = checkNotNull(job); this.baseInterval = baseInterval; this.checkpointTimeout = chkConfig.getCheckpointTimeout(); this.minPauseBetweenCheckpoints = minPauseBetweenCheckpoints; this.tasksToTrigger = checkNotNull(tasksToTrigger); this.tasksToWaitFor = 
checkNotNull(tasksToWaitFor); this.tasksToCommitTo = checkNotNull(tasksToCommitTo); this.coordinatorsToCheckpoint = Collections.unmodifiableCollection(coordinatorsToCheckpoint); this.pendingCheckpoints = new LinkedHashMap<>(); this.checkpointIdCounter = checkNotNull(checkpointIDCounter); this.completedCheckpointStore = checkNotNull(completedCheckpointStore); this.executor = checkNotNull(executor); this.checkpointsCleaner = checkNotNull(checkpointsCleaner); this.sharedStateRegistryFactory = checkNotNull(sharedStateRegistryFactory); this.sharedStateRegistry = sharedStateRegistryFactory.create(executor); this.isPreferCheckpointForRecovery = chkConfig.isPreferCheckpointForRecovery(); this.failureManager = checkNotNull(failureManager); this.clock = checkNotNull(clock); this.isExactlyOnceMode = chkConfig.isExactlyOnce(); this.unalignedCheckpointsEnabled = chkConfig.isUnalignedCheckpointsEnabled(); this.alignmentTimeout = chkConfig.getAlignmentTimeout(); this.recentPendingCheckpoints = new ArrayDeque<>(NUM_GHOST_CHECKPOINT_IDS); this.masterHooks = new HashMap<>(); this.timer = timer; this.checkpointProperties = CheckpointProperties.forCheckpoint(chkConfig.getCheckpointRetentionPolicy()); try { this.checkpointStorage = checkpointStateBackend.createCheckpointStorage(job); checkpointStorage.initializeBaseLocations(); } catch (IOException e) { throw new FlinkRuntimeException( "Failed to create checkpoint storage at checkpoint coordinator side.", e); } try { // Make sure the checkpoint ID enumerator is running. Possibly // issues a blocking call to ZooKeeper. checkpointIDCounter.start(); } catch (Throwable t) { throw new RuntimeException( "Failed to start checkpoint ID counter: " + t.getMessage(), t); } this.requestDecider = new CheckpointRequestDecider( chkConfig.getMaxConcurrentCheckpoints(), this::rescheduleTrigger, this.clock, this.minPauseBetweenCheckpoints, this.pendingCheckpoints::size, this.checkpointsCleaner::getNumberOfCheckpointsToClean); // TODO: ------------ start:二次开发代码 --------------- // CheckpointCoordinator.coordinator = this; // TODO: ------------ end:二次开发代码 --------------- // } // -------------------------------------------------------------------------------------------- // Configuration // -------------------------------------------------------------------------------------------- /** * Adds the given master hook to the checkpoint coordinator. This method does nothing, if the * checkpoint coordinator already contained a hook with the same ID (as defined via {@link * MasterTriggerRestoreHook#getIdentifier()}). * * @param hook The hook to add. * @return True, if the hook was added, false if the checkpoint coordinator already contained a * hook with the same ID. */ public boolean addMasterHook(MasterTriggerRestoreHook hook) { checkNotNull(hook); final String id = hook.getIdentifier(); checkArgument(!StringUtils.isNullOrWhitespaceOnly(id), "The hook has a null or empty id"); synchronized (lock) { if (!masterHooks.containsKey(id)) { masterHooks.put(id, hook); return true; } else { return false; } } } /** Gets the number of currently register master hooks. */ public int getNumberOfRegisteredMasterHooks() { synchronized (lock) { return masterHooks.size(); } } /** * Sets the checkpoint stats tracker. * * @param statsTracker The checkpoint stats tracker. 
*/ public void setCheckpointStatsTracker(@Nullable CheckpointStatsTracker statsTracker) { this.statsTracker = statsTracker; } // -------------------------------------------------------------------------------------------- // Clean shutdown // -------------------------------------------------------------------------------------------- /** * Shuts down the checkpoint coordinator. * *
After this method has been called, the coordinator does not accept and further messages * and cannot trigger any further checkpoints. */ public void shutdown(JobStatus jobStatus) throws Exception { synchronized (lock) { if (!shutdown) { shutdown = true; LOG.info("Stopping checkpoint coordinator for job {}.", job); periodicScheduling = false; // shut down the hooks MasterHooks.close(masterHooks.values(), LOG); masterHooks.clear(); final CheckpointException reason = new CheckpointException( CheckpointFailureReason.CHECKPOINT_COORDINATOR_SHUTDOWN); // clear queued requests and in-flight checkpoints abortPendingAndQueuedCheckpoints(reason); completedCheckpointStore.shutdown( jobStatus, checkpointsCleaner, () -> { // don't schedule anything on shutdown }); checkpointIdCounter.shutdown(jobStatus); } } } public boolean isShutdown() { return shutdown; } // -------------------------------------------------------------------------------------------- // Triggering Checkpoints and Savepoints // -------------------------------------------------------------------------------------------- /** * Triggers a savepoint with the given savepoint directory as a target. * * @param targetLocation Target location for the savepoint, optional. If null, the state * backend's configured default will be used. * @return A future to the completed checkpoint * @throws IllegalStateException If no savepoint directory has been specified and no default * savepoint directory has been configured */ public CompletableFuture triggerSavepoint( @Nullable final String targetLocation) { final CheckpointProperties properties = CheckpointProperties.forSavepoint(!unalignedCheckpointsEnabled); return triggerSavepointInternal(properties, targetLocation); } /** * Triggers a synchronous savepoint with the given savepoint directory as a target. * * @param terminate flag indicating if the job should terminate or just suspend * @param targetLocation Target location for the savepoint, optional. If null, the state * backend's configured default will be used. * @return A future to the completed checkpoint * @throws IllegalStateException If no savepoint directory has been specified and no default * savepoint directory has been configured */ public CompletableFuture triggerSynchronousSavepoint( final boolean terminate, @Nullable final String targetLocation) { final CheckpointProperties properties = CheckpointProperties.forSyncSavepoint(!unalignedCheckpointsEnabled, terminate); return triggerSavepointInternal(properties, targetLocation); } private CompletableFuture triggerSavepointInternal( final CheckpointProperties checkpointProperties, @Nullable final String targetLocation) { checkNotNull(checkpointProperties); // TODO, call triggerCheckpoint directly after removing timer thread // for now, execute the trigger in timer thread to avoid competition final CompletableFuture resultFuture = new CompletableFuture<>(); timer.execute( () -> triggerCheckpoint(checkpointProperties, targetLocation, false) .whenComplete( (completedCheckpoint, throwable) -> { if (throwable == null) { resultFuture.complete(completedCheckpoint); } else { resultFuture.completeExceptionally(throwable); } })); return resultFuture; } /** * Triggers a new standard checkpoint and uses the given timestamp as the checkpoint timestamp. * The return value is a future. It completes when the checkpoint triggered finishes or an error * occurred. * * @param isPeriodic Flag indicating whether this triggered checkpoint is periodic. 
If this flag * is true, but the periodic scheduler is disabled, the checkpoint will be declined. * @return a future to the completed checkpoint. */ public CompletableFuture triggerCheckpoint(boolean isPeriodic) { return triggerCheckpoint(checkpointProperties, null, isPeriodic); } @VisibleForTesting public CompletableFuture triggerCheckpoint( CheckpointProperties props, @Nullable String externalSavepointLocation, boolean isPeriodic) { if (props.getCheckpointType().getPostCheckpointAction() == PostCheckpointAction.TERMINATE && !(props.isSynchronous() && props.isSavepoint())) { return FutureUtils.completedExceptionally( new IllegalArgumentException( "Only synchronous savepoints are allowed to advance the watermark to MAX.")); } CheckpointTriggerRequest request = new CheckpointTriggerRequest(props, externalSavepointLocation, isPeriodic); chooseRequestToExecute(request).ifPresent(this::startTriggeringCheckpoint); return request.onCompletionPromise; } private void startTriggeringCheckpoint(CheckpointTriggerRequest request) { try { synchronized (lock) { preCheckGlobalState(request.isPeriodic); } final Execution[] executions = getTriggerExecutions(); final Map ackTasks = getAckTasks(); // we will actually trigger this checkpoint! Preconditions.checkState(!isTriggering); isTriggering = true; final long timestamp = System.currentTimeMillis(); final CompletableFuture pendingCheckpointCompletableFuture = initializeCheckpoint(request.props, request.externalSavepointLocation) .thenApplyAsync( (checkpointIdAndStorageLocation) -> createPendingCheckpoint( timestamp, request.props, ackTasks, request.isPeriodic, checkpointIdAndStorageLocation.checkpointId, checkpointIdAndStorageLocation .checkpointStorageLocation, request.getOnCompletionFuture()), timer); final CompletableFuture coordinatorCheckpointsComplete = pendingCheckpointCompletableFuture.thenComposeAsync( (pendingCheckpoint) -> OperatorCoordinatorCheckpoints .triggerAndAcknowledgeAllCoordinatorCheckpointsWithCompletion( coordinatorsToCheckpoint, pendingCheckpoint, timer), timer); // We have to take the snapshot of the master hooks after the coordinator checkpoints // has completed. // This is to ensure the tasks are checkpointed after the OperatorCoordinators in case // ExternallyInducedSource is used. final CompletableFuture masterStatesComplete = coordinatorCheckpointsComplete.thenComposeAsync( ignored -> { // If the code reaches here, the pending checkpoint is guaranteed to // be not null. // We use FutureUtils.getWithoutException() to make compiler happy // with checked // exceptions in the signature. 
PendingCheckpoint checkpoint = FutureUtils.getWithoutException( pendingCheckpointCompletableFuture); return snapshotMasterState(checkpoint); }, timer); FutureUtils.assertNoException( CompletableFuture.allOf(masterStatesComplete, coordinatorCheckpointsComplete) .handleAsync( (ignored, throwable) -> { final PendingCheckpoint checkpoint = FutureUtils.getWithoutException( pendingCheckpointCompletableFuture); Preconditions.checkState( checkpoint != null || throwable != null, "Either the pending checkpoint needs to be created or an error must have been occurred."); if (throwable != null) { // the initialization might not be finished yet if (checkpoint == null) { onTriggerFailure(request, throwable); } else { onTriggerFailure(checkpoint, throwable); } } else { if (checkpoint.isDisposed()) { onTriggerFailure( checkpoint, new CheckpointException( CheckpointFailureReason .TRIGGER_CHECKPOINT_FAILURE, checkpoint.getFailureCause())); } else { // no exception, no discarding, everything is OK final long checkpointId = checkpoint.getCheckpointId(); snapshotTaskState( timestamp, checkpointId, checkpoint.getCheckpointStorageLocation(), request.props, executions); coordinatorsToCheckpoint.forEach( (ctx) -> ctx.afterSourceBarrierInjection( checkpointId)); // It is possible that the tasks has finished // checkpointing at this point. // So we need to complete this pending checkpoint. if (!maybeCompleteCheckpoint(checkpoint)) { return null; } onTriggerSuccess(); } } return null; }, timer) .exceptionally( error -> { if (!isShutdown()) { throw new CompletionException(error); } else if (findThrowable( error, RejectedExecutionException.class) .isPresent()) { LOG.debug("Execution rejected during shutdown"); } else { LOG.warn("Error encountered during shutdown", error); } return null; })); } catch (Throwable throwable) { onTriggerFailure(request, throwable); } } /** * Initialize the checkpoint trigger asynchronously. It will be executed in io thread due to it * might be time-consuming. * * @param props checkpoint properties * @param externalSavepointLocation the external savepoint location, it might be null * @return the future of initialized result, checkpoint id and checkpoint location */ private CompletableFuture initializeCheckpoint( CheckpointProperties props, @Nullable String externalSavepointLocation) { return CompletableFuture.supplyAsync( () -> { try { // this must happen outside the coordinator-wide lock, because it // communicates // with external services (in HA mode) and may block for a while. long checkpointID = checkpointIdCounter.getAndIncrement(); CheckpointStorageLocation checkpointStorageLocation = props.isSavepoint() ? checkpointStorage.initializeLocationForSavepoint( checkpointID, externalSavepointLocation) : checkpointStorage.initializeLocationForCheckpoint( checkpointID); return new CheckpointIdAndStorageLocation( checkpointID, checkpointStorageLocation); } catch (Throwable throwable) { throw new CompletionException(throwable); } }, executor); } private PendingCheckpoint createPendingCheckpoint( long timestamp, CheckpointProperties props, Map ackTasks, boolean isPeriodic, long checkpointID, CheckpointStorageLocation checkpointStorageLocation, CompletableFuture onCompletionPromise) { synchronized (lock) { try { // since we haven't created the PendingCheckpoint yet, we need to check the // global state here. 
preCheckGlobalState(isPeriodic); } catch (Throwable t) { throw new CompletionException(t); } } final PendingCheckpoint checkpoint = new PendingCheckpoint( job, checkpointID, timestamp, ackTasks, OperatorInfo.getIds(coordinatorsToCheckpoint), masterHooks.keySet(), props, checkpointStorageLocation, onCompletionPromise); if (statsTracker != null) { PendingCheckpointStats callback = statsTracker.reportPendingCheckpoint(checkpointID, timestamp, props); checkpoint.setStatsCallback(callback); } synchronized (lock) { pendingCheckpoints.put(checkpointID, checkpoint); ScheduledFuture cancellerHandle = timer.schedule( new CheckpointCanceller(checkpoint), checkpointTimeout, TimeUnit.MILLISECONDS); if (!checkpoint.setCancellerHandle(cancellerHandle)) { // checkpoint is already disposed! cancellerHandle.cancel(false); } } LOG.info( "Triggering checkpoint {} (type={}) @ {} for job {}.", checkpointID, checkpoint.getProps().getCheckpointType(), timestamp, job); return checkpoint; } /** * Snapshot master hook states asynchronously. * * @param checkpoint the pending checkpoint * @return the future represents master hook states are finished or not */ private CompletableFuture snapshotMasterState(PendingCheckpoint checkpoint) { if (masterHooks.isEmpty()) { return CompletableFuture.completedFuture(null); } final long checkpointID = checkpoint.getCheckpointId(); final long timestamp = checkpoint.getCheckpointTimestamp(); final CompletableFuture masterStateCompletableFuture = new CompletableFuture<>(); for (MasterTriggerRestoreHook masterHook : masterHooks.values()) { MasterHooks.triggerHook(masterHook, checkpointID, timestamp, executor) .whenCompleteAsync( (masterState, throwable) -> { try { synchronized (lock) { if (masterStateCompletableFuture.isDone()) { return; } if (checkpoint.isDisposed()) { throw new IllegalStateException( "Checkpoint " + checkpointID + " has been discarded"); } if (throwable == null) { checkpoint.acknowledgeMasterState( masterHook.getIdentifier(), masterState); if (checkpoint.areMasterStatesFullyAcknowledged()) { masterStateCompletableFuture.complete(null); } } else { masterStateCompletableFuture.completeExceptionally( throwable); } } } catch (Throwable t) { masterStateCompletableFuture.completeExceptionally(t); } }, timer); } return masterStateCompletableFuture; } /** * Snapshot task state. * * @param timestamp the timestamp of this checkpoint reques * @param checkpointID the checkpoint id * @param checkpointStorageLocation the checkpoint location * @param props the checkpoint properties * @param executions the executions which should be triggered * @param advanceToEndOfTime Flag indicating if the source should inject a {@code MAX_WATERMARK} * in the pipeline to fire any registered event-time timers. */ private void snapshotTaskState( long timestamp, long checkpointID, CheckpointStorageLocation checkpointStorageLocation, CheckpointProperties props, Execution[] executions) { final CheckpointOptions checkpointOptions = CheckpointOptions.create( props.getCheckpointType(), checkpointStorageLocation.getLocationReference(), isExactlyOnceMode, unalignedCheckpointsEnabled, alignmentTimeout); // send the messages to the tasks that trigger their checkpoint for (Execution execution : executions) { if (props.isSynchronous()) { execution.triggerSynchronousSavepoint(checkpointID, timestamp, checkpointOptions); } else { execution.triggerCheckpoint(checkpointID, timestamp, checkpointOptions); } } } /** Trigger request is successful. NOTE, it must be invoked if trigger request is successful. 
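* <p>Illustrative note (not part of the original javadoc): a successful trigger resets the
* consecutive-failure counter. On the user-facing side, the number of tolerable checkpoint
* failures is configured on the checkpoint config, e.g. assuming a
* StreamExecutionEnvironment named {@code env}:
* <pre>{@code
* // env is a hypothetical StreamExecutionEnvironment from user code
* env.getCheckpointConfig().setTolerableCheckpointFailureNumber(3);
* }</pre>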
*/ private void onTriggerSuccess() { isTriggering = false; numUnsuccessfulCheckpointsTriggers.set(0); executeQueuedRequest(); } /** * The trigger request is failed prematurely without a proper initialization. There is no * resource to release, but the completion promise needs to fail manually here. * * @param onCompletionPromise the completion promise of the checkpoint/savepoint * @param throwable the reason of trigger failure */ private void onTriggerFailure( CheckpointTriggerRequest onCompletionPromise, Throwable throwable) { final CheckpointException checkpointException = getCheckpointException( CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE, throwable); onCompletionPromise.completeExceptionally(checkpointException); onTriggerFailure((PendingCheckpoint) null, checkpointException); } /** * The trigger request is failed. NOTE, it must be invoked if trigger request is failed. * * @param checkpoint the pending checkpoint which is failed. It could be null if it's failed * prematurely without a proper initialization. * @param throwable the reason of trigger failure */ private void onTriggerFailure(@Nullable PendingCheckpoint checkpoint, Throwable throwable) { // beautify the stack trace a bit throwable = ExceptionUtils.stripCompletionException(throwable); try { coordinatorsToCheckpoint.forEach( OperatorCoordinatorCheckpointContext::abortCurrentTriggering); if (checkpoint != null && !checkpoint.isDisposed()) { int numUnsuccessful = numUnsuccessfulCheckpointsTriggers.incrementAndGet(); LOG.warn( "Failed to trigger checkpoint {} for job {}. ({} consecutive failed attempts so far)", checkpoint.getCheckpointId(), job, numUnsuccessful, throwable); final CheckpointException cause = getCheckpointException( CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE, throwable); synchronized (lock) { abortPendingCheckpoint(checkpoint, cause); } } } finally { isTriggering = false; executeQueuedRequest(); } } private void executeQueuedRequest() { chooseQueuedRequestToExecute().ifPresent(this::startTriggeringCheckpoint); } private Optional chooseQueuedRequestToExecute() { synchronized (lock) { return requestDecider.chooseQueuedRequestToExecute( isTriggering, lastCheckpointCompletionRelativeTime); } } private Optional chooseRequestToExecute( CheckpointTriggerRequest request) { synchronized (lock) { return requestDecider.chooseRequestToExecute( request, isTriggering, lastCheckpointCompletionRelativeTime); } } // Returns true if the checkpoint is successfully completed, false otherwise. private boolean maybeCompleteCheckpoint(PendingCheckpoint checkpoint) { synchronized (lock) { if (checkpoint.isFullyAcknowledged()) { try { // we need to check inside the lock for being shutdown as well, // otherwise we get races and invalid error log messages. if (shutdown) { return false; } completePendingCheckpoint(checkpoint); } catch (CheckpointException ce) { onTriggerFailure(checkpoint, ce); return false; } } } return true; } // -------------------------------------------------------------------------------------------- // Handling checkpoints and messages // -------------------------------------------------------------------------------------------- /** * Receives a {@link DeclineCheckpoint} message for a pending checkpoint. 
* * @param message Checkpoint decline from the task manager * @param taskManagerLocationInfo The location info of the decline checkpoint message's sender */ public void receiveDeclineMessage(DeclineCheckpoint message, String taskManagerLocationInfo) { if (shutdown || message == null) { return; } if (!job.equals(message.getJob())) { throw new IllegalArgumentException( "Received DeclineCheckpoint message for job " + message.getJob() + " from " + taskManagerLocationInfo + " while this coordinator handles job " + job); } final long checkpointId = message.getCheckpointId(); final CheckpointException checkpointException = message.getSerializedCheckpointException().unwrap(); final String reason = checkpointException.getMessage(); PendingCheckpoint checkpoint; synchronized (lock) { // we need to check inside the lock for being shutdown as well, otherwise we // get races and invalid error log messages if (shutdown) { return; } checkpoint = pendingCheckpoints.get(checkpointId); if (checkpoint != null) { Preconditions.checkState( !checkpoint.isDisposed(), "Received message for discarded but non-removed checkpoint " + checkpointId); LOG.info( "Decline checkpoint {} by task {} of job {} at {}.", checkpointId, message.getTaskExecutionId(), job, taskManagerLocationInfo, checkpointException.getCause()); abortPendingCheckpoint( checkpoint, checkpointException, message.getTaskExecutionId()); } else if (LOG.isDebugEnabled()) { if (recentPendingCheckpoints.contains(checkpointId)) { // message is for an unknown checkpoint, or comes too late (checkpoint disposed) LOG.debug( "Received another decline message for now expired checkpoint attempt {} from task {} of job {} at {} : {}", checkpointId, message.getTaskExecutionId(), job, taskManagerLocationInfo, reason); } else { // message is for an unknown checkpoint. might be so old that we don't even // remember it any more LOG.debug( "Received decline message for unknown (too old?) checkpoint attempt {} from task {} of job {} at {} : {}", checkpointId, message.getTaskExecutionId(), job, taskManagerLocationInfo, reason); } } } } /** * Receives an AcknowledgeCheckpoint message and returns whether the message was associated with * a pending checkpoint. * * @param message Checkpoint ack from the task manager * @param taskManagerLocationInfo The location of the acknowledge checkpoint message's sender * @return Flag indicating whether the ack'd checkpoint was associated with a pending * checkpoint. * @throws CheckpointException If the checkpoint cannot be added to the completed checkpoint * store. 
*/ public boolean receiveAcknowledgeMessage( AcknowledgeCheckpoint message, String taskManagerLocationInfo) throws CheckpointException { if (shutdown || message == null) { return false; } if (!job.equals(message.getJob())) { LOG.error( "Received wrong AcknowledgeCheckpoint message for job {} from {} : {}", job, taskManagerLocationInfo, message); return false; } final long checkpointId = message.getCheckpointId(); synchronized (lock) { // we need to check inside the lock for being shutdown as well, otherwise we // get races and invalid error log messages if (shutdown) { return false; } final PendingCheckpoint checkpoint = pendingCheckpoints.get(checkpointId); if (checkpoint != null && !checkpoint.isDisposed()) { switch (checkpoint.acknowledgeTask( message.getTaskExecutionId(), message.getSubtaskState(), message.getCheckpointMetrics())) { case SUCCESS: LOG.debug( "Received acknowledge message for checkpoint {} from task {} of job {} at {}.", checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); if (checkpoint.isFullyAcknowledged()) { completePendingCheckpoint(checkpoint); } break; case DUPLICATE: LOG.debug( "Received a duplicate acknowledge message for checkpoint {}, task {}, job {}, location {}.", message.getCheckpointId(), message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); break; case UNKNOWN: LOG.warn( "Could not acknowledge the checkpoint {} for task {} of job {} at {}, " + "because the task's execution attempt id was unknown. Discarding " + "the state handle to avoid lingering state.", message.getCheckpointId(), message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); discardSubtaskState( message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); break; case DISCARDED: LOG.warn( "Could not acknowledge the checkpoint {} for task {} of job {} at {}, " + "because the pending checkpoint had been discarded. Discarding the " + "state handle tp avoid lingering state.", message.getCheckpointId(), message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); discardSubtaskState( message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); } return true; } else if (checkpoint != null) { // this should not happen throw new IllegalStateException( "Received message for discarded but non-removed checkpoint " + checkpointId); } else { boolean wasPendingCheckpoint; // message is for an unknown checkpoint, or comes too late (checkpoint disposed) if (recentPendingCheckpoints.contains(checkpointId)) { wasPendingCheckpoint = true; LOG.warn( "Received late message for now expired checkpoint attempt {} from task " + "{} of job {} at {}.", checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); } else { LOG.debug( "Received message for an unknown checkpoint {} from task {} of job {} at {}.", checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); wasPendingCheckpoint = false; } // try to discard the state so that we don't have lingering state lying around discardSubtaskState( message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); return wasPendingCheckpoint; } } } /** * Try to complete the given pending checkpoint. * *
<p>
Important: This method should only be called in the checkpoint lock scope. * * @param pendingCheckpoint to complete * @throws CheckpointException if the completion failed */ private void completePendingCheckpoint(PendingCheckpoint pendingCheckpoint) throws CheckpointException { final long checkpointId = pendingCheckpoint.getCheckpointId(); final CompletedCheckpoint completedCheckpoint; // As a first step to complete the checkpoint, we register its state with the registry Map operatorStates = pendingCheckpoint.getOperatorStates(); sharedStateRegistry.registerAll(operatorStates.values()); try { try { completedCheckpoint = pendingCheckpoint.finalizeCheckpoint( checkpointsCleaner, this::scheduleTriggerRequest, executor); failureManager.handleCheckpointSuccess(pendingCheckpoint.getCheckpointId()); } catch (Exception e1) { // abort the current pending checkpoint if we fails to finalize the pending // checkpoint. if (!pendingCheckpoint.isDisposed()) { abortPendingCheckpoint( pendingCheckpoint, new CheckpointException( CheckpointFailureReason.FINALIZE_CHECKPOINT_FAILURE, e1)); } throw new CheckpointException( "Could not finalize the pending checkpoint " + checkpointId + '.', CheckpointFailureReason.FINALIZE_CHECKPOINT_FAILURE, e1); } // the pending checkpoint must be discarded after the finalization Preconditions.checkState(pendingCheckpoint.isDisposed() && completedCheckpoint != null); try { completedCheckpointStore.addCheckpoint( completedCheckpoint, checkpointsCleaner, this::scheduleTriggerRequest); } catch (Exception exception) { // we failed to store the completed checkpoint. Let's clean up executor.execute( new Runnable() { @Override public void run() { try { completedCheckpoint.discardOnFailedStoring(); } catch (Throwable t) { LOG.warn( "Could not properly discard completed checkpoint {}.", completedCheckpoint.getCheckpointID(), t); } } }); sendAbortedMessages(checkpointId, pendingCheckpoint.getCheckpointTimestamp()); throw new CheckpointException( "Could not complete the pending checkpoint " + checkpointId + '.', CheckpointFailureReason.FINALIZE_CHECKPOINT_FAILURE, exception); } } finally { pendingCheckpoints.remove(checkpointId); scheduleTriggerRequest(); } rememberRecentCheckpointId(checkpointId); // drop those pending checkpoints that are at prior to the completed one dropSubsumedCheckpoints(checkpointId); // record the time when this was completed, to calculate // the 'min delay between checkpoints' lastCheckpointCompletionRelativeTime = clock.relativeTimeMillis(); LOG.info( "Completed checkpoint {} for job {} ({} bytes in {} ms).", checkpointId, job, completedCheckpoint.getStateSize(), completedCheckpoint.getDuration()); if (LOG.isDebugEnabled()) { StringBuilder builder = new StringBuilder(); builder.append("Checkpoint state: "); for (OperatorState state : completedCheckpoint.getOperatorStates().values()) { builder.append(state); builder.append(", "); } // Remove last two chars ", " builder.setLength(builder.length() - 2); LOG.debug(builder.toString()); } // send the "notify complete" call to all vertices, coordinators, etc. 
sendAcknowledgeMessages(checkpointId, completedCheckpoint.getTimestamp()); } void scheduleTriggerRequest() { synchronized (lock) { if (isShutdown()) { LOG.debug( "Skip scheduling trigger request because the CheckpointCoordinator is shut down"); } else { timer.execute(this::executeQueuedRequest); } } } private void sendAcknowledgeMessages(long checkpointId, long timestamp) { // commit tasks for (ExecutionVertex ev : tasksToCommitTo) { Execution ee = ev.getCurrentExecutionAttempt(); if (ee != null) { ee.notifyCheckpointComplete(checkpointId, timestamp); } } // commit coordinators for (OperatorCoordinatorCheckpointContext coordinatorContext : coordinatorsToCheckpoint) { coordinatorContext.notifyCheckpointComplete(checkpointId); } } private void sendAbortedMessages(long checkpointId, long timeStamp) { // send notification of aborted checkpoints asynchronously. executor.execute( () -> { // send the "abort checkpoint" messages to necessary vertices. for (ExecutionVertex ev : tasksToCommitTo) { Execution ee = ev.getCurrentExecutionAttempt(); if (ee != null) { ee.notifyCheckpointAborted(checkpointId, timeStamp); } } }); // commit coordinators for (OperatorCoordinatorCheckpointContext coordinatorContext : coordinatorsToCheckpoint) { coordinatorContext.notifyCheckpointAborted(checkpointId); } } /** * Fails all pending checkpoints which have not been acknowledged by the given execution attempt * id. * * @param executionAttemptId for which to discard unacknowledged pending checkpoints * @param cause of the failure */ public void failUnacknowledgedPendingCheckpointsFor( ExecutionAttemptID executionAttemptId, Throwable cause) { synchronized (lock) { abortPendingCheckpoints( checkpoint -> !checkpoint.isAcknowledgedBy(executionAttemptId), new CheckpointException(CheckpointFailureReason.TASK_FAILURE, cause)); } } private void rememberRecentCheckpointId(long id) { if (recentPendingCheckpoints.size() >= NUM_GHOST_CHECKPOINT_IDS) { recentPendingCheckpoints.removeFirst(); } recentPendingCheckpoints.addLast(id); } private void dropSubsumedCheckpoints(long checkpointId) { abortPendingCheckpoints( checkpoint -> checkpoint.getCheckpointId() < checkpointId && checkpoint.canBeSubsumed(), new CheckpointException(CheckpointFailureReason.CHECKPOINT_SUBSUMED)); } // -------------------------------------------------------------------------------------------- // Checkpoint State Restoring // -------------------------------------------------------------------------------------------- /** * Restores the latest checkpointed state. * * @param tasks Map of job vertices to restore. State for these vertices is restored via {@link * Execution#setInitialState(JobManagerTaskRestore)}. * @param errorIfNoCheckpoint Fail if no completed checkpoint is available to restore from. * @param allowNonRestoredState Allow checkpoint state that cannot be mapped to any job vertex * in tasks. * @return true if state was restored, false otherwise. * @throws IllegalStateException If the CheckpointCoordinator is shut down. * @throws IllegalStateException If no completed checkpoint is available and the * failIfNoCheckpoint flag has been set. * @throws IllegalStateException If the checkpoint contains state that cannot be mapped to any * job vertex in tasks and the allowNonRestoredState flag has not * been set. * @throws IllegalStateException If the max parallelism changed for an operator that restores * state from this checkpoint. 
* @throws IllegalStateException If the parallelism changed for an operator that restores * non-partitioned state from this checkpoint. */ @Deprecated public boolean restoreLatestCheckpointedState( Map tasks, boolean errorIfNoCheckpoint, boolean allowNonRestoredState) throws Exception { final OptionalLong restoredCheckpointId = restoreLatestCheckpointedStateInternal( new HashSet<>(tasks.values()), OperatorCoordinatorRestoreBehavior.RESTORE_OR_RESET, errorIfNoCheckpoint, allowNonRestoredState); return restoredCheckpointId.isPresent(); } /** * Restores the latest checkpointed state to a set of subtasks. This method represents a "local" * or "regional" failover and does restore states to coordinators. Note that a regional failover * might still include all tasks. * * @param tasks Set of job vertices to restore. State for these vertices is restored via {@link * Execution#setInitialState(JobManagerTaskRestore)}. * @return An {@code OptionalLong} with the checkpoint ID, if state was restored, an empty * {@code OptionalLong} otherwise. * @throws IllegalStateException If the CheckpointCoordinator is shut down. * @throws IllegalStateException If no completed checkpoint is available and the * failIfNoCheckpoint flag has been set. * @throws IllegalStateException If the checkpoint contains state that cannot be mapped to any * job vertex in tasks and the allowNonRestoredState flag has not * been set. * @throws IllegalStateException If the max parallelism changed for an operator that restores * state from this checkpoint. * @throws IllegalStateException If the parallelism changed for an operator that restores * non-partitioned state from this checkpoint. */ public OptionalLong restoreLatestCheckpointedStateToSubtasks( final Set tasks) throws Exception { // when restoring subtasks only we accept potentially unmatched state for the // following reasons // - the set frequently does not include all Job Vertices (only the ones that are part // of the restarted region), meaning there will be unmatched state by design. // - because what we might end up restoring from an original savepoint with unmatched // state, if there is was no checkpoint yet. return restoreLatestCheckpointedStateInternal( tasks, OperatorCoordinatorRestoreBehavior .SKIP, // local/regional recovery does not reset coordinators false, // recovery might come before first successful checkpoint true); // see explanation above } /** * Restores the latest checkpointed state to all tasks and all coordinators. This method * represents a "global restore"-style operation where all stateful tasks and coordinators from * the given set of Job Vertices are restored. are restored to their latest checkpointed state. * * @param tasks Set of job vertices to restore. State for these vertices is restored via {@link * Execution#setInitialState(JobManagerTaskRestore)}. * @param allowNonRestoredState Allow checkpoint state that cannot be mapped to any job vertex * in tasks. * @return true if state was restored, false otherwise. * @throws IllegalStateException If the CheckpointCoordinator is shut down. * @throws IllegalStateException If no completed checkpoint is available and the * failIfNoCheckpoint flag has been set. * @throws IllegalStateException If the checkpoint contains state that cannot be mapped to any * job vertex in tasks and the allowNonRestoredState flag has not * been set. * @throws IllegalStateException If the max parallelism changed for an operator that restores * state from this checkpoint. 
* @throws IllegalStateException If the parallelism changed for an operator that restores * non-partitioned state from this checkpoint. */ public boolean restoreLatestCheckpointedStateToAll( final Set tasks, final boolean allowNonRestoredState) throws Exception { final OptionalLong restoredCheckpointId = restoreLatestCheckpointedStateInternal( tasks, OperatorCoordinatorRestoreBehavior .RESTORE_OR_RESET, // global recovery restores coordinators, or // resets them to empty false, // recovery might come before first successful checkpoint allowNonRestoredState); return restoredCheckpointId.isPresent(); } /** * Restores the latest checkpointed at the beginning of the job execution. If there is a * checkpoint, this method acts like a "global restore"-style operation where all stateful tasks * and coordinators from the given set of Job Vertices are restored. * * @param tasks Set of job vertices to restore. State for these vertices is restored via {@link * Execution#setInitialState(JobManagerTaskRestore)}. * @return True, if a checkpoint was found and its state was restored, false otherwise. */ public boolean restoreInitialCheckpointIfPresent(final Set tasks) throws Exception { final OptionalLong restoredCheckpointId = restoreLatestCheckpointedStateInternal( tasks, OperatorCoordinatorRestoreBehavior.RESTORE_IF_CHECKPOINT_PRESENT, false, // initial checkpoints exist only on JobManager failover. ok if not // present. false); // JobManager failover means JobGraphs match exactly. return restoredCheckpointId.isPresent(); } /** * Performs the actual restore operation to the given tasks. * *
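* <p>Illustrative note (not part of the original javadoc): restoring requires a completed
* checkpoint in the checkpoint store; user jobs usually ensure one survives cancellation by
* retaining externalized checkpoints, e.g. assuming a StreamExecutionEnvironment named
* {@code env}:
* <pre>{@code
* // env is a hypothetical StreamExecutionEnvironment from user code
* env.getCheckpointConfig()
*         .enableExternalizedCheckpoints(
*                 CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
* }</pre>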
<p>
This method returns the restored checkpoint ID (as an optional) or an empty optional, if * no checkpoint was restored. */ private OptionalLong restoreLatestCheckpointedStateInternal( final Set tasks, final OperatorCoordinatorRestoreBehavior operatorCoordinatorRestoreBehavior, final boolean errorIfNoCheckpoint, final boolean allowNonRestoredState) throws Exception { synchronized (lock) { if (shutdown) { throw new IllegalStateException("CheckpointCoordinator is shut down"); } // We create a new shared state registry object, so that all pending async disposal // requests from previous // runs will go against the old object (were they can do no harm). // This must happen under the checkpoint lock. sharedStateRegistry.close(); sharedStateRegistry = sharedStateRegistryFactory.create(executor); // Recover the checkpoints, TODO this could be done only when there is a new leader, not // on each recovery completedCheckpointStore.recover(); // Now, we re-register all (shared) states from the checkpoint store with the new // registry for (CompletedCheckpoint completedCheckpoint : completedCheckpointStore.getAllCheckpoints()) { completedCheckpoint.registerSharedStatesAfterRestored(sharedStateRegistry); } LOG.debug( "Status of the shared state registry of job {} after restore: {}.", job, sharedStateRegistry); // Restore from the latest checkpoint CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint(isPreferCheckpointForRecovery); if (latest == null) { LOG.info("No checkpoint found during restore."); if (errorIfNoCheckpoint) { throw new IllegalStateException("No completed checkpoint available"); } LOG.debug("Resetting the master hooks."); MasterHooks.reset(masterHooks.values(), LOG); if (operatorCoordinatorRestoreBehavior == OperatorCoordinatorRestoreBehavior.RESTORE_OR_RESET) { // we let the JobManager-side components know that there was a recovery, // even if there was no checkpoint to recover from, yet LOG.info("Resetting the Operator Coordinators to an empty state."); restoreStateToCoordinators( OperatorCoordinator.NO_CHECKPOINT, Collections.emptyMap()); } return OptionalLong.empty(); } LOG.info("Restoring job {} from {}.", job, latest); // re-assign the task states final Map operatorStates = latest.getOperatorStates(); StateAssignmentOperation stateAssignmentOperation = new StateAssignmentOperation( latest.getCheckpointID(), tasks, operatorStates, allowNonRestoredState); stateAssignmentOperation.assignStates(); // call master hooks for restore. we currently call them also on "regional restore" // because // there is no other failure notification mechanism in the master hooks // ultimately these should get removed anyways in favor of the operator coordinators MasterHooks.restoreMasterHooks( masterHooks, latest.getMasterHookStates(), latest.getCheckpointID(), allowNonRestoredState, LOG); if (operatorCoordinatorRestoreBehavior != OperatorCoordinatorRestoreBehavior.SKIP) { restoreStateToCoordinators(latest.getCheckpointID(), operatorStates); } // update metrics if (statsTracker != null) { long restoreTimestamp = System.currentTimeMillis(); RestoredCheckpointStats restored = new RestoredCheckpointStats( latest.getCheckpointID(), latest.getProperties(), restoreTimestamp, latest.getExternalPointer()); statsTracker.reportRestoredCheckpoint(restored); } return OptionalLong.of(latest.getCheckpointID()); } } /** * Restore the state with given savepoint. * * @param savepointPointer The pointer to the savepoint. 
* @param allowNonRestored True if allowing checkpoint state that cannot be mapped to any job * vertex in tasks. * @param tasks Map of job vertices to restore. State for these vertices is restored via {@link * Execution#setInitialState(JobManagerTaskRestore)}. * @param userClassLoader The class loader to resolve serialized classes in legacy savepoint * versions. */ public boolean restoreSavepoint( String savepointPointer, boolean allowNonRestored, Map tasks, ClassLoader userClassLoader) throws Exception { Preconditions.checkNotNull(savepointPointer, "The savepoint path cannot be null."); LOG.info( "Starting job {} from savepoint {} ({})", job, savepointPointer, (allowNonRestored ? "allowing non restored state" : "")); final CompletedCheckpointStorageLocation checkpointLocation = checkpointStorage.resolveCheckpoint(savepointPointer); // Load the savepoint as a checkpoint into the system CompletedCheckpoint savepoint = Checkpoints.loadAndValidateCheckpoint( job, tasks, checkpointLocation, userClassLoader, allowNonRestored); completedCheckpointStore.addCheckpoint( savepoint, checkpointsCleaner, this::scheduleTriggerRequest); // Reset the checkpoint ID counter long nextCheckpointId = savepoint.getCheckpointID() + 1; checkpointIdCounter.setCount(nextCheckpointId); LOG.info("Reset the checkpoint ID of job {} to {}.", job, nextCheckpointId); final OptionalLong restoredCheckpointId = restoreLatestCheckpointedStateInternal( new HashSet<>(tasks.values()), OperatorCoordinatorRestoreBehavior.RESTORE_IF_CHECKPOINT_PRESENT, true, allowNonRestored); return restoredCheckpointId.isPresent(); } // ------------------------------------------------------------------------ // Accessors // ------------------------------------------------------------------------ public int getNumberOfPendingCheckpoints() { synchronized (lock) { return this.pendingCheckpoints.size(); } } public int getNumberOfRetainedSuccessfulCheckpoints() { synchronized (lock) { return completedCheckpointStore.getNumberOfRetainedCheckpoints(); } } public Map getPendingCheckpoints() { synchronized (lock) { return new HashMap<>(this.pendingCheckpoints); } } public List getSuccessfulCheckpoints() throws Exception { synchronized (lock) { return completedCheckpointStore.getAllCheckpoints(); } } public CheckpointStorageCoordinatorView getCheckpointStorage() { return checkpointStorage; } public CompletedCheckpointStore getCheckpointStore() { return completedCheckpointStore; } public long getCheckpointTimeout() { return checkpointTimeout; } /** @deprecated use {@link #getNumQueuedRequests()} */ @Deprecated @VisibleForTesting PriorityQueue getTriggerRequestQueue() { synchronized (lock) { return requestDecider.getTriggerRequestQueue(); } } public boolean isTriggering() { return isTriggering; } @VisibleForTesting boolean isCurrentPeriodicTriggerAvailable() { return currentPeriodicTrigger != null; } /** * Returns whether periodic checkpointing has been configured. * * @return true if periodic checkpoints have been configured. 
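* <p>Illustrative note (not part of the original javadoc): {@code baseInterval} becomes
* finite once a checkpoint interval is set on the user side, e.g. assuming a
* StreamExecutionEnvironment named {@code env}:
* <pre>{@code
* // env is a hypothetical StreamExecutionEnvironment from user code
* env.enableCheckpointing(60_000L); // trigger roughly every 60 seconds
* env.getCheckpointConfig().setMinPauseBetweenCheckpoints(30_000L);
* }</pre>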
*/ public boolean isPeriodicCheckpointingConfigured() { return baseInterval != Long.MAX_VALUE; } // -------------------------------------------------------------------------------------------- // Periodic scheduling of checkpoints // -------------------------------------------------------------------------------------------- public void startCheckpointScheduler() { synchronized (lock) { if (shutdown) { throw new IllegalArgumentException("Checkpoint coordinator is shut down"); } // make sure all prior timers are cancelled stopCheckpointScheduler(); periodicScheduling = true; currentPeriodicTrigger = scheduleTriggerWithDelay(getRandomInitDelay()); } } public void stopCheckpointScheduler() { synchronized (lock) { periodicScheduling = false; cancelPeriodicTrigger(); final CheckpointException reason = new CheckpointException(CheckpointFailureReason.CHECKPOINT_COORDINATOR_SUSPEND); abortPendingAndQueuedCheckpoints(reason); numUnsuccessfulCheckpointsTriggers.set(0); } } /** * Aborts all the pending checkpoints due to en exception. * * @param exception The exception. */ public void abortPendingCheckpoints(CheckpointException exception) { synchronized (lock) { abortPendingCheckpoints(ignored -> true, exception); } } private void abortPendingCheckpoints( Predicate checkpointToFailPredicate, CheckpointException exception) { assert Thread.holdsLock(lock); final PendingCheckpoint[] pendingCheckpointsToFail = pendingCheckpoints.values().stream() .filter(checkpointToFailPredicate) .toArray(PendingCheckpoint[]::new); // do not traverse pendingCheckpoints directly, because it might be changed during // traversing for (PendingCheckpoint pendingCheckpoint : pendingCheckpointsToFail) { abortPendingCheckpoint(pendingCheckpoint, exception); } } private void rescheduleTrigger(long tillNextMillis) { cancelPeriodicTrigger(); currentPeriodicTrigger = scheduleTriggerWithDelay(tillNextMillis); } private void cancelPeriodicTrigger() { if (currentPeriodicTrigger != null) { currentPeriodicTrigger.cancel(false); currentPeriodicTrigger = null; } } private long getRandomInitDelay() { return ThreadLocalRandom.current().nextLong(minPauseBetweenCheckpoints, baseInterval + 1L); } private ScheduledFuture scheduleTriggerWithDelay(long initDelay) { return timer.scheduleAtFixedRate( new ScheduledTrigger(), initDelay, baseInterval, TimeUnit.MILLISECONDS); } private void restoreStateToCoordinators( final long checkpointId, final Map operatorStates) throws Exception { for (OperatorCoordinatorCheckpointContext coordContext : coordinatorsToCheckpoint) { final OperatorState state = operatorStates.get(coordContext.operatorId()); final ByteStreamStateHandle coordinatorState = state == null ? null : state.getCoordinatorState(); final byte[] bytes = coordinatorState == null ? 
null : coordinatorState.getData(); coordContext.resetToCheckpoint(checkpointId, bytes); } } // ------------------------------------------------------------------------ // job status listener that schedules / cancels periodic checkpoints // ------------------------------------------------------------------------ public JobStatusListener createActivatorDeactivator() { synchronized (lock) { if (shutdown) { throw new IllegalArgumentException("Checkpoint coordinator is shut down"); } if (jobStatusListener == null) { jobStatusListener = new CheckpointCoordinatorDeActivator(this); } return jobStatusListener; } } int getNumQueuedRequests() { synchronized (lock) { return requestDecider.getNumQueuedRequests(); } } // ------------------------------------------------------------------------ private final class ScheduledTrigger implements Runnable { @Override public void run() { try { triggerCheckpoint(true); } catch (Exception e) { LOG.error("Exception while triggering checkpoint for job {}.", job, e); } } } /** * Discards the given state object asynchronously belonging to the given job, execution attempt * id and checkpoint id. * * @param jobId identifying the job to which the state object belongs * @param executionAttemptID identifying the task to which the state object belongs * @param checkpointId of the state object * @param subtaskState to discard asynchronously */ private void discardSubtaskState( final JobID jobId, final ExecutionAttemptID executionAttemptID, final long checkpointId, final TaskStateSnapshot subtaskState) { if (subtaskState != null) { executor.execute( new Runnable() { @Override public void run() { try { subtaskState.discardState(); } catch (Throwable t2) { LOG.warn( "Could not properly discard state object of checkpoint {} " + "belonging to task {} of job {}.", checkpointId, executionAttemptID, jobId, t2); } } }); } } private void abortPendingCheckpoint( PendingCheckpoint pendingCheckpoint, CheckpointException exception) { abortPendingCheckpoint(pendingCheckpoint, exception, null); } private void abortPendingCheckpoint( PendingCheckpoint pendingCheckpoint, CheckpointException exception, @Nullable final ExecutionAttemptID executionAttemptID) { assert (Thread.holdsLock(lock)); if (!pendingCheckpoint.isDisposed()) { try { // release resource here pendingCheckpoint.abort( exception.getCheckpointFailureReason(), exception.getCause(), checkpointsCleaner, this::scheduleTriggerRequest, executor); if (pendingCheckpoint.getProps().isSavepoint() && pendingCheckpoint.getProps().isSynchronous()) { failureManager.handleSynchronousSavepointFailure(exception); } else if (executionAttemptID != null) { failureManager.handleTaskLevelCheckpointException( exception, pendingCheckpoint.getCheckpointId(), executionAttemptID); } else { failureManager.handleJobLevelCheckpointException( exception, pendingCheckpoint.getCheckpointId()); } } finally { sendAbortedMessages( pendingCheckpoint.getCheckpointId(), pendingCheckpoint.getCheckpointTimestamp()); pendingCheckpoints.remove(pendingCheckpoint.getCheckpointId()); rememberRecentCheckpointId(pendingCheckpoint.getCheckpointId()); scheduleTriggerRequest(); } } } private void preCheckGlobalState(boolean isPeriodic) throws CheckpointException { // abort if the coordinator has been shutdown in the meantime if (shutdown) { throw new CheckpointException(CheckpointFailureReason.CHECKPOINT_COORDINATOR_SHUTDOWN); } // Don't allow periodic checkpoint if scheduling has been disabled if (isPeriodic && !periodicScheduling) { throw new 
CheckpointException(CheckpointFailureReason.PERIODIC_SCHEDULER_SHUTDOWN); } } /** * Check if all tasks that we need to trigger are running. If not, abort the checkpoint. * * @return the executions need to be triggered. * @throws CheckpointException the exception fails checking */ private Execution[] getTriggerExecutions() throws CheckpointException { Execution[] executions = new Execution[tasksToTrigger.length]; for (int i = 0; i < tasksToTrigger.length; i++) { Execution ee = tasksToTrigger[i].getCurrentExecutionAttempt(); if (ee == null) { LOG.info( "Checkpoint triggering task {} of job {} is not being executed at the moment. Aborting checkpoint.", tasksToTrigger[i].getTaskNameWithSubtaskIndex(), job); throw new CheckpointException( CheckpointFailureReason.NOT_ALL_REQUIRED_TASKS_RUNNING); } else if (ee.getState() == ExecutionState.RUNNING) { executions[i] = ee; } else { LOG.info( "Checkpoint triggering task {} of job {} is not in state {} but {} instead. Aborting checkpoint.", tasksToTrigger[i].getTaskNameWithSubtaskIndex(), job, ExecutionState.RUNNING, ee.getState()); throw new CheckpointException( CheckpointFailureReason.NOT_ALL_REQUIRED_TASKS_RUNNING); } } return executions; } /** * Check if all tasks that need to acknowledge the checkpoint are running. If not, abort the * checkpoint * * @return the execution vertices which should give an ack response * @throws CheckpointException the exception fails checking */ private Map getAckTasks() throws CheckpointException { Map ackTasks = new HashMap<>(tasksToWaitFor.length); for (ExecutionVertex ev : tasksToWaitFor) { Execution ee = ev.getCurrentExecutionAttempt(); if (ee != null) { ackTasks.put(ee.getAttemptId(), ev); } else { LOG.info( "Checkpoint acknowledging task {} of job {} is not being executed at the moment. Aborting checkpoint.", ev.getTaskNameWithSubtaskIndex(), job); throw new CheckpointException( CheckpointFailureReason.NOT_ALL_REQUIRED_TASKS_RUNNING); } } return ackTasks; } private void abortPendingAndQueuedCheckpoints(CheckpointException exception) { assert (Thread.holdsLock(lock)); requestDecider.abortAll(exception); abortPendingCheckpoints(exception); } /** * The canceller of checkpoint. The checkpoint might be cancelled if it doesn't finish in a * configured period. 
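* <p>Illustrative note (not part of the original javadoc): the expiry period corresponds to
* the user-facing checkpoint timeout, e.g. assuming a StreamExecutionEnvironment named
* {@code env}:
* <pre>{@code
* // env is a hypothetical StreamExecutionEnvironment from user code
* env.getCheckpointConfig().setCheckpointTimeout(10 * 60 * 1000L); // 10 minutes
* }</pre>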
*/ private class CheckpointCanceller implements Runnable { private final PendingCheckpoint pendingCheckpoint; private CheckpointCanceller(PendingCheckpoint pendingCheckpoint) { this.pendingCheckpoint = checkNotNull(pendingCheckpoint); } @Override public void run() { synchronized (lock) { // only do the work if the checkpoint is not discarded anyways // note that checkpoint completion discards the pending checkpoint object if (!pendingCheckpoint.isDisposed()) { LOG.info( "Checkpoint {} of job {} expired before completing.", pendingCheckpoint.getCheckpointId(), job); abortPendingCheckpoint( pendingCheckpoint, new CheckpointException(CheckpointFailureReason.CHECKPOINT_EXPIRED)); } } } } private static CheckpointException getCheckpointException( CheckpointFailureReason defaultReason, Throwable throwable) { final Optional checkpointExceptionOptional = findThrowable(throwable, CheckpointException.class); return checkpointExceptionOptional.orElseGet( () -> new CheckpointException(defaultReason, throwable)); } private static class CheckpointIdAndStorageLocation { private final long checkpointId; private final CheckpointStorageLocation checkpointStorageLocation; CheckpointIdAndStorageLocation( long checkpointId, CheckpointStorageLocation checkpointStorageLocation) { this.checkpointId = checkpointId; this.checkpointStorageLocation = checkNotNull(checkpointStorageLocation); } } static class CheckpointTriggerRequest { final long timestamp; final CheckpointProperties props; final @Nullable String externalSavepointLocation; final boolean isPeriodic; private final CompletableFuture onCompletionPromise = new CompletableFuture<>(); CheckpointTriggerRequest( CheckpointProperties props, @Nullable String externalSavepointLocation, boolean isPeriodic) { this.timestamp = System.currentTimeMillis(); this.props = checkNotNull(props); this.externalSavepointLocation = externalSavepointLocation; this.isPeriodic = isPeriodic; } CompletableFuture getOnCompletionFuture() { return onCompletionPromise; } public void completeExceptionally(CheckpointException exception) { onCompletionPromise.completeExceptionally(exception); } public boolean isForce() { return props.forceCheckpoint(); } } private enum OperatorCoordinatorRestoreBehavior { /** Coordinators are always restored. If there is no checkpoint, they are restored empty. */ RESTORE_OR_RESET, /** Coordinators are restored if there was a checkpoint. */ RESTORE_IF_CHECKPOINT_PRESENT, /** Coordinators are not restored during this checkpoint restore. */ SKIP; } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.12/org/apache/flink/runtime/util/EnvironmentInformation.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.runtime.util; import org.apache.flink.configuration.GlobalConfiguration; import org.apache.flink.util.OperatingSystem; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.InputStream; import java.lang.management.ManagementFactory; import java.lang.management.RuntimeMXBean; import java.lang.reflect.Method; import java.time.Instant; import java.time.ZoneId; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeParseException; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.concurrent.ConcurrentHashMap; /** * Utility class that gives access to the execution environment of the JVM, like the executing user, * startup options, or the JVM version. */ public class EnvironmentInformation { private static final Logger LOG = LoggerFactory.getLogger(EnvironmentInformation.class); public static final String UNKNOWN = ""; // TODO: ------------ start:二次开发代码 --------------- // // 用于判断是否为JobManager private static Boolean IS_JOBMANAGER = true; private static final Map settings = new ConcurrentHashMap<>(); /** * 用不判断当前组件是否为JobManager */ public static boolean isJobManager() { return IS_JOBMANAGER; } /** * 获取配置信息 */ public static Map getSettings() { return settings; } /** * 设置配置信息 */ public static void setSetting(String key, String value) { if (!settings.containsKey(key)) { settings.put(key, value); } } // TODO: ------------ end:二次开发代码 ----------------- // /** * Returns the version of the code as String. * * @return The project version string. */ public static String getVersion() { return getVersionsInstance().projectVersion; } /** * Returns the version of the used Scala compiler as String. * * @return The scala version string. */ public static String getScalaVersion() { return getVersionsInstance().scalaVersion; } /** @return The Instant this version of the software was built. */ public static Instant getBuildTime() { return getVersionsInstance().gitBuildTime; } /** * @return The Instant this version of the software was built as a String using the * Europe/Berlin timezone. */ public static String getBuildTimeString() { return getVersionsInstance().gitBuildTimeStr; } /** @return The last known commit id of this version of the software. */ public static String getGitCommitId() { return getVersionsInstance().gitCommitId; } /** @return The last known abbreviated commit id of this version of the software. */ public static String getGitCommitIdAbbrev() { return getVersionsInstance().gitCommitIdAbbrev; } /** @return The Instant of the last commit of this code. */ public static Instant getGitCommitTime() { return getVersionsInstance().gitCommitTime; } /** * @return The Instant of the last commit of this code as a String using the Europe/Berlin * timezone. */ public static String getGitCommitTimeString() { return getVersionsInstance().gitCommitTimeStr; } /** * Returns the code revision (commit and commit date) of Flink, as generated by the Maven * builds. * * @return The code revision. 
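* <p>Illustrative usage (not part of the original javadoc), using only members defined in
* this class:
* <pre>{@code
* EnvironmentInformation.RevisionInformation rev =
*         EnvironmentInformation.getRevisionInformation();
* LOG.info(
*         "Flink {} (rev {} from {})",
*         EnvironmentInformation.getVersion(),
*         rev.commitId,
*         rev.commitDate);
* }</pre>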
*/ public static RevisionInformation getRevisionInformation() { return new RevisionInformation(getGitCommitIdAbbrev(), getGitCommitTimeString()); } private static final class Versions { private static final Instant DEFAULT_TIME_INSTANT = Instant.EPOCH; private static final String DEFAULT_TIME_STRING = "1970-01-01T00:00:00+0000"; private static final String UNKNOWN_COMMIT_ID = "DecafC0ffeeD0d0F00d"; private static final String UNKNOWN_COMMIT_ID_ABBREV = "DeadD0d0"; private String projectVersion = UNKNOWN; private String scalaVersion = UNKNOWN; private Instant gitBuildTime = DEFAULT_TIME_INSTANT; private String gitBuildTimeStr = DEFAULT_TIME_STRING; private String gitCommitId = UNKNOWN_COMMIT_ID; private String gitCommitIdAbbrev = UNKNOWN_COMMIT_ID_ABBREV; private Instant gitCommitTime = DEFAULT_TIME_INSTANT; private String gitCommitTimeStr = DEFAULT_TIME_STRING; private static final String PROP_FILE = ".flink-runtime.version.properties"; private static final String FAIL_MESSAGE = "The file " + PROP_FILE + " has not been generated correctly. You MUST run 'mvn generate-sources' in the flink-runtime module."; private String getProperty(Properties properties, String key, String defaultValue) { String value = properties.getProperty(key); if (value == null || value.charAt(0) == '$') { return defaultValue; } return value; } public Versions() { ClassLoader classLoader = EnvironmentInformation.class.getClassLoader(); try (InputStream propFile = classLoader.getResourceAsStream(PROP_FILE)) { if (propFile != null) { Properties properties = new Properties(); properties.load(propFile); projectVersion = getProperty(properties, "project.version", UNKNOWN); scalaVersion = getProperty(properties, "scala.binary.version", UNKNOWN); gitCommitId = getProperty(properties, "git.commit.id", UNKNOWN_COMMIT_ID); gitCommitIdAbbrev = getProperty( properties, "git.commit.id.abbrev", UNKNOWN_COMMIT_ID_ABBREV); // This is to reliably parse the datetime format configured in the // git-commit-id-plugin DateTimeFormatter gitDateTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ssZ"); // Default format is in Berlin timezone because that is where Flink originated. DateTimeFormatter berlinDateTime = DateTimeFormatter.ISO_OFFSET_DATE_TIME.withZone( ZoneId.of("Europe/Berlin")); try { String propGitCommitTime = getProperty(properties, "git.commit.time", DEFAULT_TIME_STRING); gitCommitTime = gitDateTimeFormatter.parse(propGitCommitTime, Instant::from); gitCommitTimeStr = berlinDateTime.format(gitCommitTime); String propGitBuildTime = getProperty(properties, "git.build.time", DEFAULT_TIME_STRING); gitBuildTime = gitDateTimeFormatter.parse(propGitBuildTime, Instant::from); gitBuildTimeStr = berlinDateTime.format(gitBuildTime); } catch (DateTimeParseException dtpe) { LOG.error("{} : {}", FAIL_MESSAGE, dtpe); throw new IllegalStateException(FAIL_MESSAGE); } } } catch (IOException ioe) { LOG.info( "Cannot determine code revision: Unable to read version property file.: {}", ioe.getMessage()); } } } private static final class VersionsHolder { static final Versions INSTANCE = new Versions(); } private static Versions getVersionsInstance() { return VersionsHolder.INSTANCE; } /** * Gets the name of the user that is running the JVM. * * @return The name of the user that is running the JVM. 
*/ public static String getHadoopUser() { try { Class ugiClass = Class.forName( "org.apache.hadoop.security.UserGroupInformation", false, EnvironmentInformation.class.getClassLoader()); Method currentUserMethod = ugiClass.getMethod("getCurrentUser"); Method shortUserNameMethod = ugiClass.getMethod("getShortUserName"); Object ugi = currentUserMethod.invoke(null); return (String) shortUserNameMethod.invoke(ugi); } catch (ClassNotFoundException e) { return ""; } catch (LinkageError e) { // hadoop classes are not in the classpath LOG.debug( "Cannot determine user/group information using Hadoop utils. " + "Hadoop classes not loaded or compatible", e); } catch (Throwable t) { // some other error occurred that we should log and make known LOG.warn("Error while accessing user/group information via Hadoop utils.", t); } return UNKNOWN; } /** * The maximum JVM heap size, in bytes. * *
<p>
This method uses the -Xmx value of the JVM, if set. If not set, it returns (as a * heuristic) 1/4th of the physical memory size. * * @return The maximum JVM heap size, in bytes. */ public static long getMaxJvmHeapMemory() { final long maxMemory = Runtime.getRuntime().maxMemory(); if (maxMemory != Long.MAX_VALUE) { // we have the proper max memory return maxMemory; } else { // max JVM heap size is not set - use the heuristic to use 1/4th of the physical memory final long physicalMemory = Hardware.getSizeOfPhysicalMemory(); if (physicalMemory != -1) { // got proper value for physical memory return physicalMemory / 4; } else { throw new RuntimeException( "Could not determine the amount of free memory.\n" + "Please set the maximum memory for the JVM, e.g. -Xmx512M for 512 megabytes."); } } } /** * Gets an estimate of the size of the free heap memory. * *
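* <p>Illustrative usage (not part of the original javadoc):
* <pre>{@code
* long freeHeapMb = EnvironmentInformation.getSizeOfFreeHeapMemoryWithDefrag() >> 20;
* LOG.info("Roughly {} MB of heap are free after GC", freeHeapMb);
* }</pre>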
<p>
NOTE: This method is heavy-weight. It triggers a garbage collection to reduce * fragmentation and get a better estimate at the size of free memory. It is typically more * accurate than the plain version {@link #getSizeOfFreeHeapMemory()}. * * @return An estimate of the size of the free heap memory, in bytes. */ public static long getSizeOfFreeHeapMemoryWithDefrag() { // trigger a garbage collection, to reduce fragmentation System.gc(); return getSizeOfFreeHeapMemory(); } /** * Gets an estimate of the size of the free heap memory. The estimate may vary, depending on the * current level of memory fragmentation and the number of dead objects. For a better (but more * heavy-weight) estimate, use {@link #getSizeOfFreeHeapMemoryWithDefrag()}. * * @return An estimate of the size of the free heap memory, in bytes. */ public static long getSizeOfFreeHeapMemory() { Runtime r = Runtime.getRuntime(); return getMaxJvmHeapMemory() - r.totalMemory() + r.freeMemory(); } /** * Gets the version of the JVM in the form "VM_Name - Vendor - Spec/Version". * * @return The JVM version. */ public static String getJvmVersion() { try { final RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); return bean.getVmName() + " - " + bean.getVmVendor() + " - " + bean.getSpecVersion() + '/' + bean.getVmVersion(); } catch (Throwable t) { return UNKNOWN; } } /** * Gets the system parameters and environment parameters that were passed to the JVM on startup. * * @return The options passed to the JVM on startup. */ public static String getJvmStartupOptions() { try { final RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); final StringBuilder bld = new StringBuilder(); for (String s : bean.getInputArguments()) { bld.append(s).append(' '); } return bld.toString(); } catch (Throwable t) { return UNKNOWN; } } /** * Gets the system parameters and environment parameters that were passed to the JVM on startup. * * @return The options passed to the JVM on startup. */ public static String[] getJvmStartupOptionsArray() { try { RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); List options = bean.getInputArguments(); return options.toArray(new String[options.size()]); } catch (Throwable t) { return new String[0]; } } /** * Gets the directory for temporary files, as returned by the JVM system property * "java.io.tmpdir". * * @return The directory for temporary files. */ public static String getTemporaryFileDirectory() { return System.getProperty("java.io.tmpdir"); } /** * Tries to retrieve the maximum number of open file handles. This method will only work on * UNIX-based operating systems with Sun/Oracle Java versions. * *
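* <p>Illustrative usage (not part of the original javadoc):
* <pre>{@code
* long fdLimit = EnvironmentInformation.getOpenFileHandlesLimit();
* if (fdLimit != -1L && fdLimit < 10_000L) {
*     LOG.warn("Low open file handle limit: {}", fdLimit);
* }
* }</pre>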
<p>
If the number of max open file handles cannot be determined, this method returns {@code * -1}. * * @return The limit of open file handles, or {@code -1}, if the limit could not be determined. */ public static long getOpenFileHandlesLimit() { if (OperatingSystem .isWindows()) { // getMaxFileDescriptorCount method is not available on Windows return -1L; } Class sunBeanClass; try { sunBeanClass = Class.forName("com.sun.management.UnixOperatingSystemMXBean"); } catch (ClassNotFoundException e) { return -1L; } try { Method fhLimitMethod = sunBeanClass.getMethod("getMaxFileDescriptorCount"); Object result = fhLimitMethod.invoke(ManagementFactory.getOperatingSystemMXBean()); return (Long) result; } catch (Throwable t) { LOG.warn("Unexpected error when accessing file handle limit", t); return -1L; } } // TODO: ------------ start:二次开发代码 ---------------- // /** * 解析命令并判断是否为JobManager */ private static void parseCommand(String[] commandLineArgs) { if (commandLineArgs != null) { for (String command : commandLineArgs) { if (command != null && command.length() > 0) { if (command.contains("resource-id")) { IS_JOBMANAGER = false; } if (!"-D".equals(command)) { String[] properties = command.replace("-D", "").split("=", 2); if (properties != null && properties.length == 2 && properties[0] != null && properties[1] != null) { settings.put(properties[0], properties[1]); } } } } } } // TODO: ------------ end:二次开发代码 ----------------- // /** * Logs information about the environment, like code revision, current user, Java version, and * JVM parameters. * * @param log The logger to log the information to. * @param componentName The component name to mention in the log. * @param commandLineArgs The arguments accompanying the starting the component. */ public static void logEnvironmentInfo( Logger log, String componentName, String[] commandLineArgs) { // TODO: ------------ start:二次开发代码 --------------- // parseCommand(commandLineArgs); // TODO: ------------ end:二次开发代码 ---------------- // if (log.isInfoEnabled()) { RevisionInformation rev = getRevisionInformation(); String version = getVersion(); String scalaVersion = getScalaVersion(); String jvmVersion = getJvmVersion(); String[] options = getJvmStartupOptionsArray(); String javaHome = System.getenv("JAVA_HOME"); String inheritedLogs = System.getenv("FLINK_INHERITED_LOGS"); long maxHeapMegabytes = getMaxJvmHeapMemory() >>> 20; if (inheritedLogs != null) { log.info( "--------------------------------------------------------------------------------"); log.info(" Preconfiguration: "); log.info(inheritedLogs); } log.info( "--------------------------------------------------------------------------------"); log.info( " Starting " + componentName + " (Version: " + version + ", Scala: " + scalaVersion + ", " + "Rev:" + rev.commitId + ", " + "Date:" + rev.commitDate + ")"); log.info(" OS current user: " + System.getProperty("user.name")); log.info(" Current Hadoop/Kerberos user: " + getHadoopUser()); log.info(" JVM: " + jvmVersion); log.info(" Maximum heap size: " + maxHeapMegabytes + " MiBytes"); log.info(" JAVA_HOME: " + (javaHome == null ? 
"(not set)" : javaHome)); String hadoopVersionString = getHadoopVersionString(); if (hadoopVersionString != null) { log.info(" Hadoop version: " + hadoopVersionString); } else { log.info(" No Hadoop Dependency available"); } if (options.length == 0) { log.info(" JVM Options: (none)"); } else { log.info(" JVM Options:"); for (String s : options) { log.info(" " + s); } } if (commandLineArgs == null || commandLineArgs.length == 0) { log.info(" Program Arguments: (none)"); } else { log.info(" Program Arguments:"); for (String s : commandLineArgs) { if (GlobalConfiguration.isSensitive(s)) { log.info( " " + GlobalConfiguration.HIDDEN_CONTENT + " (sensitive information)"); } else { log.info(" " + s); } } } log.info(" Classpath: " + System.getProperty("java.class.path")); log.info( "--------------------------------------------------------------------------------"); } } public static String getHadoopVersionString() { try { Class versionInfoClass = Class.forName( "org.apache.hadoop.util.VersionInfo", false, EnvironmentInformation.class.getClassLoader()); Method method = versionInfoClass.getMethod("getVersion"); return (String) method.invoke(null); } catch (ClassNotFoundException | NoSuchMethodException e) { return null; } catch (Throwable e) { LOG.error("Cannot invoke VersionInfo.getVersion reflectively.", e); return null; } } // -------------------------------------------------------------------------------------------- /** Don't instantiate this class */ private EnvironmentInformation() {} // -------------------------------------------------------------------------------------------- /** * Revision information encapsulates information about the source code revision of the Flink * code. */ public static class RevisionInformation { /** The git commit id (hash) */ public final String commitId; /** The git commit date */ public final String commitDate; public RevisionInformation(String commitId, String commitDate) { this.commitId = commitId; this.commitDate = commitDate; } } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.12/org/apache/flink/table/api/internal/TableEnvironmentImpl.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.table.api.internal; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.JobExecutionResult; import org.apache.flink.api.dag.Pipeline; import org.apache.flink.api.dag.Transformation; import org.apache.flink.configuration.PipelineOptions; import org.apache.flink.core.execution.JobClient; import org.apache.flink.table.api.*; import org.apache.flink.table.catalog.*; import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; import org.apache.flink.table.catalog.exceptions.TableNotExistException; import org.apache.flink.table.catalog.exceptions.*; import org.apache.flink.table.delegation.*; import org.apache.flink.table.descriptors.ConnectTableDescriptor; import org.apache.flink.table.descriptors.ConnectorDescriptor; import org.apache.flink.table.descriptors.StreamTableDescriptor; import org.apache.flink.table.expressions.ApiExpressionUtils; import org.apache.flink.table.expressions.Expression; import org.apache.flink.table.factories.CatalogFactory; import org.apache.flink.table.factories.ComponentFactoryService; import org.apache.flink.table.factories.TableFactoryService; import org.apache.flink.table.functions.ScalarFunction; import org.apache.flink.table.functions.UserDefinedFunction; import org.apache.flink.table.functions.UserDefinedFunctionHelper; import org.apache.flink.table.module.Module; import org.apache.flink.table.module.ModuleManager; import org.apache.flink.table.operations.*; import org.apache.flink.table.operations.ddl.*; import org.apache.flink.table.operations.utils.OperationTreeBuilder; import org.apache.flink.table.sinks.TableSink; import org.apache.flink.table.sources.TableSource; import org.apache.flink.table.sources.TableSourceValidation; import org.apache.flink.table.types.AbstractDataType; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.utils.PrintUtils; import org.apache.flink.table.utils.TableSchemaUtils; import org.apache.flink.types.Row; import java.lang.reflect.Method; import java.util.*; import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Collectors; import java.util.stream.StreamSupport; /** * Implementation of {@link TableEnvironment} that works exclusively with Table API interfaces. Only * {@link TableSource} is supported as an input and {@link TableSink} as an output. It also does not * bind to any particular {@code StreamExecutionEnvironment}. */ @Internal public class TableEnvironmentImpl implements TableEnvironmentInternal { // Flag that tells if the TableSource/TableSink used in this environment is stream table // source/sink, // and this should always be true. This avoids too many hard code. private static final boolean IS_STREAM_TABLE = true; private final CatalogManager catalogManager; private final ModuleManager moduleManager; private final OperationTreeBuilder operationTreeBuilder; private final List bufferedModifyOperations = new ArrayList<>(); protected final TableConfig tableConfig; protected final Executor execEnv; protected final FunctionCatalog functionCatalog; protected final Planner planner; protected final Parser parser; private final boolean isStreamingMode; private final ClassLoader userClassLoader; private static final String UNSUPPORTED_QUERY_IN_SQL_UPDATE_MSG = "Unsupported SQL query! 
sqlUpdate() only accepts a single SQL statement of type " + "INSERT, CREATE TABLE, DROP TABLE, ALTER TABLE, USE CATALOG, USE [CATALOG.]DATABASE, " + "CREATE DATABASE, DROP DATABASE, ALTER DATABASE, CREATE FUNCTION, DROP FUNCTION, ALTER FUNCTION, " + "CREATE CATALOG, DROP CATALOG, CREATE VIEW, DROP VIEW."; private static final String UNSUPPORTED_QUERY_IN_EXECUTE_SQL_MSG = "Unsupported SQL query! executeSql() only accepts a single SQL statement of type " + "CREATE TABLE, DROP TABLE, ALTER TABLE, CREATE DATABASE, DROP DATABASE, ALTER DATABASE, " + "CREATE FUNCTION, DROP FUNCTION, ALTER FUNCTION, CREATE CATALOG, DROP CATALOG, " + "USE CATALOG, USE [CATALOG.]DATABASE, SHOW CATALOGS, SHOW DATABASES, SHOW TABLES, SHOW FUNCTIONS, SHOW PARTITIONS" + "CREATE VIEW, DROP VIEW, SHOW VIEWS, INSERT, DESCRIBE."; /** Provides necessary methods for {@link ConnectTableDescriptor}. */ private final Registration registration = new Registration() { @Override public void createTemporaryTable(String path, CatalogBaseTable table) { UnresolvedIdentifier unresolvedIdentifier = parser.parseIdentifier(path); ObjectIdentifier objectIdentifier = catalogManager.qualifyIdentifier(unresolvedIdentifier); catalogManager.createTemporaryTable(table, objectIdentifier, false); } }; protected TableEnvironmentImpl( CatalogManager catalogManager, ModuleManager moduleManager, TableConfig tableConfig, Executor executor, FunctionCatalog functionCatalog, Planner planner, boolean isStreamingMode, ClassLoader userClassLoader) { this.catalogManager = catalogManager; this.catalogManager.setCatalogTableSchemaResolver( new CatalogTableSchemaResolver(planner.getParser(), isStreamingMode)); this.moduleManager = moduleManager; this.execEnv = executor; this.tableConfig = tableConfig; this.functionCatalog = functionCatalog; this.planner = planner; this.parser = planner.getParser(); this.isStreamingMode = isStreamingMode; this.userClassLoader = userClassLoader; this.operationTreeBuilder = OperationTreeBuilder.create( tableConfig, functionCatalog.asLookup(parser::parseIdentifier), catalogManager.getDataTypeFactory(), path -> { try { UnresolvedIdentifier unresolvedIdentifier = parser.parseIdentifier(path); Optional catalogQueryOperation = scanInternal(unresolvedIdentifier); return catalogQueryOperation.map( t -> ApiExpressionUtils.tableRef(path, t)); } catch (SqlParserException ex) { // The TableLookup is used during resolution of expressions and it // actually might not be an // identifier of a table. It might be a reference to some other // object such as column, local // reference etc. This method should return empty optional in such // cases to fallback for other // identifiers resolution. 
return Optional.empty(); } }, isStreamingMode); } public static TableEnvironmentImpl create(EnvironmentSettings settings) { // temporary solution until FLINK-15635 is fixed ClassLoader classLoader = Thread.currentThread().getContextClassLoader(); TableConfig tableConfig = new TableConfig(); ModuleManager moduleManager = new ModuleManager(); CatalogManager catalogManager = CatalogManager.newBuilder() .classLoader(classLoader) .config(tableConfig.getConfiguration()) .defaultCatalog( settings.getBuiltInCatalogName(), new GenericInMemoryCatalog( settings.getBuiltInCatalogName(), settings.getBuiltInDatabaseName())) .build(); FunctionCatalog functionCatalog = new FunctionCatalog(tableConfig, catalogManager, moduleManager); Map executorProperties = settings.toExecutorProperties(); Executor executor = ComponentFactoryService.find(ExecutorFactory.class, executorProperties) .create(executorProperties); Map plannerProperties = settings.toPlannerProperties(); Planner planner = ComponentFactoryService.find(PlannerFactory.class, plannerProperties) .create( plannerProperties, executor, tableConfig, functionCatalog, catalogManager); return new TableEnvironmentImpl( catalogManager, moduleManager, tableConfig, executor, functionCatalog, planner, settings.isStreamingMode(), classLoader); } @Override public Table fromValues(Object... values) { return fromValues(Arrays.asList(values)); } @Override public Table fromValues(AbstractDataType rowType, Object... values) { return fromValues(rowType, Arrays.asList(values)); } @Override public Table fromValues(Expression... values) { return createTable(operationTreeBuilder.values(values)); } @Override public Table fromValues(AbstractDataType rowType, Expression... values) { final DataType resolvedDataType = catalogManager.getDataTypeFactory().createDataType(rowType); return createTable(operationTreeBuilder.values(resolvedDataType, values)); } @Override public Table fromValues(Iterable values) { Expression[] exprs = StreamSupport.stream(values.spliterator(), false) .map(ApiExpressionUtils::objectToExpression) .toArray(Expression[]::new); return fromValues(exprs); } @Override public Table fromValues(AbstractDataType rowType, Iterable values) { Expression[] exprs = StreamSupport.stream(values.spliterator(), false) .map(ApiExpressionUtils::objectToExpression) .toArray(Expression[]::new); return fromValues(rowType, exprs); } @VisibleForTesting public Planner getPlanner() { return planner; } @Override public Table fromTableSource(TableSource source) { // only accept StreamTableSource and LookupableTableSource here // TODO should add a validation, while StreamTableSource is in flink-table-api-java-bridge // module now return createTable(new TableSourceQueryOperation<>(source, !IS_STREAM_TABLE)); } @Override public void registerCatalog(String catalogName, Catalog catalog) { catalogManager.registerCatalog(catalogName, catalog); } @Override public Optional getCatalog(String catalogName) { return catalogManager.getCatalog(catalogName); } @Override public void loadModule(String moduleName, Module module) { moduleManager.loadModule(moduleName, module); } @Override public void unloadModule(String moduleName) { moduleManager.unloadModule(moduleName); } @Override public void registerFunction(String name, ScalarFunction function) { functionCatalog.registerTempSystemScalarFunction(name, function); } @Override public void createTemporarySystemFunction( String name, Class functionClass) { final UserDefinedFunction functionInstance = 
UserDefinedFunctionHelper.instantiateFunction(functionClass); createTemporarySystemFunction(name, functionInstance); } @Override public void createTemporarySystemFunction(String name, UserDefinedFunction functionInstance) { functionCatalog.registerTemporarySystemFunction(name, functionInstance, false); } @Override public boolean dropTemporarySystemFunction(String name) { return functionCatalog.dropTemporarySystemFunction(name, true); } @Override public void createFunction(String path, Class functionClass) { createFunction(path, functionClass, false); } @Override public void createFunction( String path, Class functionClass, boolean ignoreIfExists) { final UnresolvedIdentifier unresolvedIdentifier = parser.parseIdentifier(path); functionCatalog.registerCatalogFunction( unresolvedIdentifier, functionClass, ignoreIfExists); } @Override public boolean dropFunction(String path) { final UnresolvedIdentifier unresolvedIdentifier = parser.parseIdentifier(path); return functionCatalog.dropCatalogFunction(unresolvedIdentifier, true); } @Override public void createTemporaryFunction( String path, Class functionClass) { final UserDefinedFunction functionInstance = UserDefinedFunctionHelper.instantiateFunction(functionClass); createTemporaryFunction(path, functionInstance); } @Override public void createTemporaryFunction(String path, UserDefinedFunction functionInstance) { final UnresolvedIdentifier unresolvedIdentifier = parser.parseIdentifier(path); functionCatalog.registerTemporaryCatalogFunction( unresolvedIdentifier, functionInstance, false); } @Override public boolean dropTemporaryFunction(String path) { final UnresolvedIdentifier unresolvedIdentifier = parser.parseIdentifier(path); return functionCatalog.dropTemporaryCatalogFunction(unresolvedIdentifier, true); } @Override public void registerTable(String name, Table table) { UnresolvedIdentifier identifier = UnresolvedIdentifier.of(name); createTemporaryView(identifier, table); } @Override public void createTemporaryView(String path, Table view) { UnresolvedIdentifier identifier = parser.parseIdentifier(path); createTemporaryView(identifier, view); } private void createTemporaryView(UnresolvedIdentifier identifier, Table view) { if (((TableImpl) view).getTableEnvironment() != this) { throw new TableException( "Only table API objects that belong to this TableEnvironment can be registered."); } ObjectIdentifier tableIdentifier = catalogManager.qualifyIdentifier(identifier); QueryOperation queryOperation = qualifyQueryOperation(tableIdentifier, view.getQueryOperation()); CatalogBaseTable tableTable = new QueryOperationCatalogView(queryOperation); catalogManager.createTemporaryTable(tableTable, tableIdentifier, false); } @Override public Table scan(String... 
tablePath) { UnresolvedIdentifier unresolvedIdentifier = UnresolvedIdentifier.of(tablePath); return scanInternal(unresolvedIdentifier) .map(this::createTable) .orElseThrow( () -> new ValidationException( String.format( "Table %s was not found.", unresolvedIdentifier))); } @Override public Table from(String path) { UnresolvedIdentifier unresolvedIdentifier = parser.parseIdentifier(path); return scanInternal(unresolvedIdentifier) .map(this::createTable) .orElseThrow( () -> new ValidationException( String.format( "Table %s was not found.", unresolvedIdentifier))); } @Override public void insertInto(String targetPath, Table table) { UnresolvedIdentifier unresolvedIdentifier = parser.parseIdentifier(targetPath); insertIntoInternal(unresolvedIdentifier, table); } @Override public void insertInto(Table table, String sinkPath, String... sinkPathContinued) { List fullPath = new ArrayList<>(Arrays.asList(sinkPathContinued)); fullPath.add(0, sinkPath); UnresolvedIdentifier unresolvedIdentifier = UnresolvedIdentifier.of(fullPath); insertIntoInternal(unresolvedIdentifier, table); } private void insertIntoInternal(UnresolvedIdentifier unresolvedIdentifier, Table table) { ObjectIdentifier objectIdentifier = catalogManager.qualifyIdentifier(unresolvedIdentifier); List modifyOperations = Collections.singletonList( new CatalogSinkModifyOperation( objectIdentifier, table.getQueryOperation())); buffer(modifyOperations); } private Optional scanInternal(UnresolvedIdentifier identifier) { ObjectIdentifier tableIdentifier = catalogManager.qualifyIdentifier(identifier); return catalogManager .getTable(tableIdentifier) .map(t -> new CatalogQueryOperation(tableIdentifier, t.getResolvedSchema())); } @Override public ConnectTableDescriptor connect(ConnectorDescriptor connectorDescriptor) { return new StreamTableDescriptor(registration, connectorDescriptor); } @Override public String[] listCatalogs() { return catalogManager.listCatalogs().stream().sorted().toArray(String[]::new); } @Override public String[] listModules() { return moduleManager.listModules().toArray(new String[0]); } @Override public String[] listDatabases() { return catalogManager .getCatalog(catalogManager.getCurrentCatalog()) .get() .listDatabases() .toArray(new String[0]); } @Override public String[] listTables() { return catalogManager.listTables().stream().sorted().toArray(String[]::new); } @Override public String[] listViews() { return catalogManager.listViews().stream().sorted().toArray(String[]::new); } @Override public String[] listTemporaryTables() { return catalogManager.listTemporaryTables().stream().sorted().toArray(String[]::new); } @Override public String[] listTemporaryViews() { return catalogManager.listTemporaryViews().stream().sorted().toArray(String[]::new); } @Override public boolean dropTemporaryTable(String path) { UnresolvedIdentifier unresolvedIdentifier = parser.parseIdentifier(path); ObjectIdentifier identifier = catalogManager.qualifyIdentifier(unresolvedIdentifier); try { catalogManager.dropTemporaryTable(identifier, false); return true; } catch (ValidationException e) { return false; } } @Override public boolean dropTemporaryView(String path) { UnresolvedIdentifier unresolvedIdentifier = parser.parseIdentifier(path); ObjectIdentifier identifier = catalogManager.qualifyIdentifier(unresolvedIdentifier); try { catalogManager.dropTemporaryView(identifier, false); return true; } catch (ValidationException e) { return false; } } @Override public String[] listUserDefinedFunctions() { return 
functionCatalog.getUserDefinedFunctions(); } @Override public String[] listFunctions() { return functionCatalog.getFunctions(); } @Override public String explain(Table table) { return explain(table, false); } @Override public String explain(Table table, boolean extended) { return planner.explain( Collections.singletonList(table.getQueryOperation()), getExplainDetails(extended)); } @Override public String explain(boolean extended) { List operations = bufferedModifyOperations.stream() .map(o -> (Operation) o) .collect(Collectors.toList()); return planner.explain(operations, getExplainDetails(extended)); } @Override public String explainSql(String statement, ExplainDetail... extraDetails) { List operations = parser.parse(statement); if (operations.size() != 1) { throw new TableException( "Unsupported SQL query! explainSql() only accepts a single SQL query."); } return planner.explain(operations, extraDetails); } @Override public String explainInternal(List operations, ExplainDetail... extraDetails) { return planner.explain(operations, extraDetails); } @Override public String[] getCompletionHints(String statement, int position) { return planner.getCompletionHints(statement, position); } @Override public Table sqlQuery(String query) { List operations = parser.parse(query); if (operations.size() != 1) { throw new ValidationException( "Unsupported SQL query! sqlQuery() only accepts a single SQL query."); } Operation operation = operations.get(0); if (operation instanceof QueryOperation && !(operation instanceof ModifyOperation)) { return createTable((QueryOperation) operation); } else { throw new ValidationException( "Unsupported SQL query! sqlQuery() only accepts a single SQL query of type " + "SELECT, UNION, INTERSECT, EXCEPT, VALUES, and ORDER_BY."); } } // TODO: ------------ start:二次开发代码 --------------- // private static Method sqlParseMethod = null; private static AtomicBoolean canParse = new AtomicBoolean(true); // TODO: ------------ end:二次开发代码 ----------------- // @Override public TableResult executeSql(String statement) { // TODO: ------------ start:二次开发代码 --------------- // // 使用反射获取进行sql收集,避免api找不到的异常 try { if (canParse.get()) { if (sqlParseMethod == null) { Class clazz = Class.forName("com.zto.fire.flink.sql.FlinkSqlParser"); sqlParseMethod = clazz.getMethod("sqlParse", String.class); sqlParseMethod.setAccessible(true); } if (sqlParseMethod != null) { sqlParseMethod.invoke(null, statement); } } } catch (Exception e) { try { // 当调用sql解析相关api发生异常时,认为api无法被类加载器所加载,后续将不会尝试调用 canParse.set(false); } catch (Exception e1) {} } // TODO: ------------ end:二次开发代码 ----------------- // List operations = parser.parse(statement); if (operations.size() != 1) { throw new TableException(UNSUPPORTED_QUERY_IN_EXECUTE_SQL_MSG); } return executeOperation(operations.get(0)); } @Override public StatementSet createStatementSet() { return new StatementSetImpl(this); } @Override public TableResult executeInternal(List operations) { List> transformations = translate(operations); List sinkIdentifierNames = extractSinkIdentifierNames(operations); String jobName = getJobName("insert-into_" + String.join(",", sinkIdentifierNames)); Pipeline pipeline = execEnv.createPipeline(transformations, tableConfig, jobName); try { JobClient jobClient = execEnv.executeAsync(pipeline); TableSchema.Builder builder = TableSchema.builder(); Object[] affectedRowCounts = new Long[operations.size()]; for (int i = 0; i < operations.size(); ++i) { // use sink identifier name as field name builder.field(sinkIdentifierNames.get(i), 
DataTypes.BIGINT()); affectedRowCounts[i] = -1L; } return TableResultImpl.builder() .jobClient(jobClient) .resultKind(ResultKind.SUCCESS_WITH_CONTENT) .tableSchema(builder.build()) .data( new InsertResultIterator( jobClient, Row.of(affectedRowCounts), userClassLoader)) .build(); } catch (Exception e) { throw new TableException("Failed to execute sql", e); } } @Override public TableResult executeInternal(QueryOperation operation) { SelectSinkOperation sinkOperation = new SelectSinkOperation(operation); List> transformations = translate(Collections.singletonList(sinkOperation)); String jobName = getJobName("collect"); Pipeline pipeline = execEnv.createPipeline(transformations, tableConfig, jobName); try { JobClient jobClient = execEnv.executeAsync(pipeline); SelectResultProvider resultProvider = sinkOperation.getSelectResultProvider(); resultProvider.setJobClient(jobClient); return TableResultImpl.builder() .jobClient(jobClient) .resultKind(ResultKind.SUCCESS_WITH_CONTENT) .tableSchema(operation.getTableSchema()) .data(resultProvider.getResultIterator()) .setPrintStyle( TableResultImpl.PrintStyle.tableau( PrintUtils.MAX_COLUMN_WIDTH, PrintUtils.NULL_COLUMN, true, isStreamingMode)) .build(); } catch (Exception e) { throw new TableException("Failed to execute sql", e); } } @Override public void sqlUpdate(String stmt) { System.out.println("sqlUpdate->\n" + stmt); List operations = parser.parse(stmt); if (operations.size() != 1) { throw new TableException(UNSUPPORTED_QUERY_IN_SQL_UPDATE_MSG); } Operation operation = operations.get(0); if (operation instanceof ModifyOperation) { buffer(Collections.singletonList((ModifyOperation) operation)); } else if (operation instanceof CreateTableOperation || operation instanceof DropTableOperation || operation instanceof AlterTableOperation || operation instanceof CreateViewOperation || operation instanceof DropViewOperation || operation instanceof CreateDatabaseOperation || operation instanceof DropDatabaseOperation || operation instanceof AlterDatabaseOperation || operation instanceof CreateCatalogFunctionOperation || operation instanceof CreateTempSystemFunctionOperation || operation instanceof DropCatalogFunctionOperation || operation instanceof DropTempSystemFunctionOperation || operation instanceof AlterCatalogFunctionOperation || operation instanceof CreateCatalogOperation || operation instanceof DropCatalogOperation || operation instanceof UseCatalogOperation || operation instanceof UseDatabaseOperation) { executeOperation(operation); } else { throw new TableException(UNSUPPORTED_QUERY_IN_SQL_UPDATE_MSG); } } private TableResult executeOperation(Operation operation) { if (operation instanceof ModifyOperation) { return executeInternal(Collections.singletonList((ModifyOperation) operation)); } else if (operation instanceof CreateTableOperation) { CreateTableOperation createTableOperation = (CreateTableOperation) operation; if (createTableOperation.isTemporary()) { catalogManager.createTemporaryTable( createTableOperation.getCatalogTable(), createTableOperation.getTableIdentifier(), createTableOperation.isIgnoreIfExists()); } else { catalogManager.createTable( createTableOperation.getCatalogTable(), createTableOperation.getTableIdentifier(), createTableOperation.isIgnoreIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof DropTableOperation) { DropTableOperation dropTableOperation = (DropTableOperation) operation; if (dropTableOperation.isTemporary()) { catalogManager.dropTemporaryTable( 
dropTableOperation.getTableIdentifier(), dropTableOperation.isIfExists()); } else { catalogManager.dropTable( dropTableOperation.getTableIdentifier(), dropTableOperation.isIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof AlterTableOperation) { AlterTableOperation alterTableOperation = (AlterTableOperation) operation; Catalog catalog = getCatalogOrThrowException( alterTableOperation.getTableIdentifier().getCatalogName()); String exMsg = getDDLOpExecuteErrorMsg(alterTableOperation.asSummaryString()); try { if (alterTableOperation instanceof AlterTableRenameOperation) { AlterTableRenameOperation alterTableRenameOp = (AlterTableRenameOperation) operation; catalog.renameTable( alterTableRenameOp.getTableIdentifier().toObjectPath(), alterTableRenameOp.getNewTableIdentifier().getObjectName(), false); } else if (alterTableOperation instanceof AlterTablePropertiesOperation) { AlterTablePropertiesOperation alterTablePropertiesOp = (AlterTablePropertiesOperation) operation; catalog.alterTable( alterTablePropertiesOp.getTableIdentifier().toObjectPath(), alterTablePropertiesOp.getCatalogTable(), false); } else if (alterTableOperation instanceof AlterTableAddConstraintOperation) { AlterTableAddConstraintOperation addConstraintOP = (AlterTableAddConstraintOperation) operation; CatalogTable oriTable = (CatalogTable) catalogManager .getTable(addConstraintOP.getTableIdentifier()) .get() .getTable(); TableSchema.Builder builder = TableSchemaUtils.builderWithGivenSchema(oriTable.getSchema()); if (addConstraintOP.getConstraintName().isPresent()) { builder.primaryKey( addConstraintOP.getConstraintName().get(), addConstraintOP.getColumnNames()); } else { builder.primaryKey(addConstraintOP.getColumnNames()); } CatalogTable newTable = new CatalogTableImpl( builder.build(), oriTable.getPartitionKeys(), oriTable.getOptions(), oriTable.getComment()); catalog.alterTable( addConstraintOP.getTableIdentifier().toObjectPath(), newTable, false); } else if (alterTableOperation instanceof AlterTableDropConstraintOperation) { AlterTableDropConstraintOperation dropConstraintOperation = (AlterTableDropConstraintOperation) operation; CatalogTable oriTable = (CatalogTable) catalogManager .getTable(dropConstraintOperation.getTableIdentifier()) .get() .getTable(); CatalogTable newTable = new CatalogTableImpl( TableSchemaUtils.dropConstraint( oriTable.getSchema(), dropConstraintOperation.getConstraintName()), oriTable.getPartitionKeys(), oriTable.getOptions(), oriTable.getComment()); catalog.alterTable( dropConstraintOperation.getTableIdentifier().toObjectPath(), newTable, false); } else if (alterTableOperation instanceof AlterPartitionPropertiesOperation) { AlterPartitionPropertiesOperation alterPartPropsOp = (AlterPartitionPropertiesOperation) operation; catalog.alterPartition( alterPartPropsOp.getTableIdentifier().toObjectPath(), alterPartPropsOp.getPartitionSpec(), alterPartPropsOp.getCatalogPartition(), false); } else if (alterTableOperation instanceof AlterTableSchemaOperation) { AlterTableSchemaOperation alterTableSchemaOperation = (AlterTableSchemaOperation) alterTableOperation; catalog.alterTable( alterTableSchemaOperation.getTableIdentifier().toObjectPath(), alterTableSchemaOperation.getCatalogTable(), false); } else if (alterTableOperation instanceof AddPartitionsOperation) { AddPartitionsOperation addPartitionsOperation = (AddPartitionsOperation) alterTableOperation; List specs = addPartitionsOperation.getPartitionSpecs(); List partitions = 
addPartitionsOperation.getCatalogPartitions(); boolean ifNotExists = addPartitionsOperation.ifNotExists(); ObjectPath tablePath = addPartitionsOperation.getTableIdentifier().toObjectPath(); for (int i = 0; i < specs.size(); i++) { catalog.createPartition( tablePath, specs.get(i), partitions.get(i), ifNotExists); } } else if (alterTableOperation instanceof DropPartitionsOperation) { DropPartitionsOperation dropPartitionsOperation = (DropPartitionsOperation) alterTableOperation; ObjectPath tablePath = dropPartitionsOperation.getTableIdentifier().toObjectPath(); boolean ifExists = dropPartitionsOperation.ifExists(); for (CatalogPartitionSpec spec : dropPartitionsOperation.getPartitionSpecs()) { catalog.dropPartition(tablePath, spec, ifExists); } } return TableResultImpl.TABLE_RESULT_OK; } catch (TableAlreadyExistException | TableNotExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof CreateViewOperation) { CreateViewOperation createViewOperation = (CreateViewOperation) operation; if (createViewOperation.isTemporary()) { catalogManager.createTemporaryTable( createViewOperation.getCatalogView(), createViewOperation.getViewIdentifier(), createViewOperation.isIgnoreIfExists()); } else { catalogManager.createTable( createViewOperation.getCatalogView(), createViewOperation.getViewIdentifier(), createViewOperation.isIgnoreIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof DropViewOperation) { DropViewOperation dropViewOperation = (DropViewOperation) operation; if (dropViewOperation.isTemporary()) { catalogManager.dropTemporaryView( dropViewOperation.getViewIdentifier(), dropViewOperation.isIfExists()); } else { catalogManager.dropView( dropViewOperation.getViewIdentifier(), dropViewOperation.isIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof AlterViewOperation) { AlterViewOperation alterViewOperation = (AlterViewOperation) operation; Catalog catalog = getCatalogOrThrowException( alterViewOperation.getViewIdentifier().getCatalogName()); String exMsg = getDDLOpExecuteErrorMsg(alterViewOperation.asSummaryString()); try { if (alterViewOperation instanceof AlterViewRenameOperation) { AlterViewRenameOperation alterTableRenameOp = (AlterViewRenameOperation) operation; catalog.renameTable( alterTableRenameOp.getViewIdentifier().toObjectPath(), alterTableRenameOp.getNewViewIdentifier().getObjectName(), false); } else if (alterViewOperation instanceof AlterViewPropertiesOperation) { AlterViewPropertiesOperation alterTablePropertiesOp = (AlterViewPropertiesOperation) operation; catalog.alterTable( alterTablePropertiesOp.getViewIdentifier().toObjectPath(), alterTablePropertiesOp.getCatalogView(), false); } else if (alterViewOperation instanceof AlterViewAsOperation) { AlterViewAsOperation alterViewAsOperation = (AlterViewAsOperation) alterViewOperation; catalog.alterTable( alterViewAsOperation.getViewIdentifier().toObjectPath(), alterViewAsOperation.getNewView(), false); } return TableResultImpl.TABLE_RESULT_OK; } catch (TableAlreadyExistException | TableNotExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof CreateDatabaseOperation) { CreateDatabaseOperation createDatabaseOperation = (CreateDatabaseOperation) operation; Catalog catalog = getCatalogOrThrowException(createDatabaseOperation.getCatalogName()); String exMsg = 
getDDLOpExecuteErrorMsg(createDatabaseOperation.asSummaryString()); try { catalog.createDatabase( createDatabaseOperation.getDatabaseName(), createDatabaseOperation.getCatalogDatabase(), createDatabaseOperation.isIgnoreIfExists()); return TableResultImpl.TABLE_RESULT_OK; } catch (DatabaseAlreadyExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof DropDatabaseOperation) { DropDatabaseOperation dropDatabaseOperation = (DropDatabaseOperation) operation; Catalog catalog = getCatalogOrThrowException(dropDatabaseOperation.getCatalogName()); String exMsg = getDDLOpExecuteErrorMsg(dropDatabaseOperation.asSummaryString()); try { catalog.dropDatabase( dropDatabaseOperation.getDatabaseName(), dropDatabaseOperation.isIfExists(), dropDatabaseOperation.isCascade()); return TableResultImpl.TABLE_RESULT_OK; } catch (DatabaseNotExistException | DatabaseNotEmptyException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof AlterDatabaseOperation) { AlterDatabaseOperation alterDatabaseOperation = (AlterDatabaseOperation) operation; Catalog catalog = getCatalogOrThrowException(alterDatabaseOperation.getCatalogName()); String exMsg = getDDLOpExecuteErrorMsg(alterDatabaseOperation.asSummaryString()); try { catalog.alterDatabase( alterDatabaseOperation.getDatabaseName(), alterDatabaseOperation.getCatalogDatabase(), false); return TableResultImpl.TABLE_RESULT_OK; } catch (DatabaseNotExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof CreateCatalogFunctionOperation) { return createCatalogFunction((CreateCatalogFunctionOperation) operation); } else if (operation instanceof CreateTempSystemFunctionOperation) { return createSystemFunction((CreateTempSystemFunctionOperation) operation); } else if (operation instanceof DropCatalogFunctionOperation) { return dropCatalogFunction((DropCatalogFunctionOperation) operation); } else if (operation instanceof DropTempSystemFunctionOperation) { return dropSystemFunction((DropTempSystemFunctionOperation) operation); } else if (operation instanceof AlterCatalogFunctionOperation) { return alterCatalogFunction((AlterCatalogFunctionOperation) operation); } else if (operation instanceof CreateCatalogOperation) { return createCatalog((CreateCatalogOperation) operation); } else if (operation instanceof DropCatalogOperation) { DropCatalogOperation dropCatalogOperation = (DropCatalogOperation) operation; String exMsg = getDDLOpExecuteErrorMsg(dropCatalogOperation.asSummaryString()); try { catalogManager.unregisterCatalog( dropCatalogOperation.getCatalogName(), dropCatalogOperation.isIfExists()); return TableResultImpl.TABLE_RESULT_OK; } catch (CatalogException e) { throw new ValidationException(exMsg, e); } } else if (operation instanceof UseCatalogOperation) { UseCatalogOperation useCatalogOperation = (UseCatalogOperation) operation; catalogManager.setCurrentCatalog(useCatalogOperation.getCatalogName()); return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof UseDatabaseOperation) { UseDatabaseOperation useDatabaseOperation = (UseDatabaseOperation) operation; catalogManager.setCurrentCatalog(useDatabaseOperation.getCatalogName()); catalogManager.setCurrentDatabase(useDatabaseOperation.getDatabaseName()); return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof 
ShowCatalogsOperation) { return buildShowResult("catalog name", listCatalogs()); } else if (operation instanceof ShowCurrentCatalogOperation) { return buildShowResult( "current catalog name", new String[] {catalogManager.getCurrentCatalog()}); } else if (operation instanceof ShowDatabasesOperation) { return buildShowResult("database name", listDatabases()); } else if (operation instanceof ShowCurrentDatabaseOperation) { return buildShowResult( "current database name", new String[] {catalogManager.getCurrentDatabase()}); } else if (operation instanceof ShowTablesOperation) { return buildShowResult("table name", listTables()); } else if (operation instanceof ShowFunctionsOperation) { return buildShowResult("function name", listFunctions()); } else if (operation instanceof ShowViewsOperation) { return buildShowResult("view name", listViews()); } else if (operation instanceof ShowPartitionsOperation) { String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { ShowPartitionsOperation showPartitionsOperation = (ShowPartitionsOperation) operation; Catalog catalog = getCatalogOrThrowException( showPartitionsOperation.getTableIdentifier().getCatalogName()); ObjectPath tablePath = showPartitionsOperation.getTableIdentifier().toObjectPath(); CatalogPartitionSpec partitionSpec = showPartitionsOperation.getPartitionSpec(); List partitionSpecs = partitionSpec == null ? catalog.listPartitions(tablePath) : catalog.listPartitions(tablePath, partitionSpec); List partitionNames = new ArrayList<>(partitionSpecs.size()); for (CatalogPartitionSpec spec : partitionSpecs) { List partitionKVs = new ArrayList<>(spec.getPartitionSpec().size()); for (Map.Entry partitionKV : spec.getPartitionSpec().entrySet()) { partitionKVs.add(partitionKV.getKey() + "=" + partitionKV.getValue()); } partitionNames.add(String.join("/", partitionKVs)); } return buildShowResult("partition name", partitionNames.toArray(new String[0])); } catch (TableNotExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof ExplainOperation) { String explanation = planner.explain( Collections.singletonList(((ExplainOperation) operation).getChild())); return TableResultImpl.builder() .resultKind(ResultKind.SUCCESS_WITH_CONTENT) .tableSchema(TableSchema.builder().field("result", DataTypes.STRING()).build()) .data(Collections.singletonList(Row.of(explanation))) .setPrintStyle(TableResultImpl.PrintStyle.rawContent()) .build(); } else if (operation instanceof DescribeTableOperation) { DescribeTableOperation describeTableOperation = (DescribeTableOperation) operation; Optional result = catalogManager.getTable(describeTableOperation.getSqlIdentifier()); if (result.isPresent()) { return buildDescribeResult(result.get().getResolvedSchema()); } else { throw new ValidationException( String.format( "Tables or views with the identifier '%s' doesn't exist", describeTableOperation.getSqlIdentifier().asSummaryString())); } } else if (operation instanceof QueryOperation) { return executeInternal((QueryOperation) operation); } else { throw new TableException(UNSUPPORTED_QUERY_IN_EXECUTE_SQL_MSG); } } private TableResult createCatalog(CreateCatalogOperation operation) { String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { String catalogName = operation.getCatalogName(); Map properties = operation.getProperties(); final CatalogFactory factory = TableFactoryService.find(CatalogFactory.class, properties, userClassLoader); Catalog catalog = 
factory.createCatalog(catalogName, properties); catalogManager.registerCatalog(catalogName, catalog); return TableResultImpl.TABLE_RESULT_OK; } catch (CatalogException e) { throw new ValidationException(exMsg, e); } } private TableResult buildShowResult(String columnName, String[] objects) { return buildResult( new String[] {columnName}, new DataType[] {DataTypes.STRING()}, Arrays.stream(objects).map((c) -> new String[] {c}).toArray(String[][]::new)); } private TableResult buildDescribeResult(TableSchema schema) { Map fieldToWatermark = schema.getWatermarkSpecs().stream() .collect( Collectors.toMap( WatermarkSpec::getRowtimeAttribute, WatermarkSpec::getWatermarkExpr)); Map fieldToPrimaryKey = new HashMap<>(); schema.getPrimaryKey() .ifPresent( (p) -> { List columns = p.getColumns(); columns.forEach( (c) -> fieldToPrimaryKey.put( c, String.format( "PRI(%s)", String.join(", ", columns)))); }); Object[][] rows = schema.getTableColumns().stream() .map( (c) -> { final LogicalType logicalType = c.getType().getLogicalType(); return new Object[] { c.getName(), logicalType.copy(true).asSummaryString(), logicalType.isNullable(), fieldToPrimaryKey.getOrDefault(c.getName(), null), c.explainExtras().orElse(null), fieldToWatermark.getOrDefault(c.getName(), null) }; }) .toArray(Object[][]::new); return buildResult( new String[] {"name", "type", "null", "key", "extras", "watermark"}, new DataType[] { DataTypes.STRING(), DataTypes.STRING(), DataTypes.BOOLEAN(), DataTypes.STRING(), DataTypes.STRING(), DataTypes.STRING() }, rows); } private TableResult buildResult(String[] headers, DataType[] types, Object[][] rows) { return TableResultImpl.builder() .resultKind(ResultKind.SUCCESS_WITH_CONTENT) .tableSchema(TableSchema.builder().fields(headers, types).build()) .data(Arrays.stream(rows).map(Row::of).collect(Collectors.toList())) .setPrintStyle( TableResultImpl.PrintStyle.tableau(Integer.MAX_VALUE, "", false, false)) .build(); } /** * extract sink identifier names from {@link ModifyOperation}s. * *

If there are multiple ModifyOperations have same name, an index suffix will be added at * the end of the name to ensure each name is unique. */ private List extractSinkIdentifierNames(List operations) { List tableNames = new ArrayList<>(operations.size()); Map tableNameToCount = new HashMap<>(); for (ModifyOperation operation : operations) { if (operation instanceof CatalogSinkModifyOperation) { ObjectIdentifier identifier = ((CatalogSinkModifyOperation) operation).getTableIdentifier(); String fullName = identifier.asSummaryString(); tableNames.add(fullName); tableNameToCount.put(fullName, tableNameToCount.getOrDefault(fullName, 0) + 1); } else { throw new UnsupportedOperationException("Unsupported operation: " + operation); } } Map tableNameToIndex = new HashMap<>(); return tableNames.stream() .map( tableName -> { if (tableNameToCount.get(tableName) == 1) { return tableName; } else { Integer index = tableNameToIndex.getOrDefault(tableName, 0) + 1; tableNameToIndex.put(tableName, index); return tableName + "_" + index; } }) .collect(Collectors.toList()); } private String getJobName(String defaultJobName) { return tableConfig.getConfiguration().getString(PipelineOptions.NAME, defaultJobName); } /** Get catalog from catalogName or throw a ValidationException if the catalog not exists. */ private Catalog getCatalogOrThrowException(String catalogName) { return getCatalog(catalogName) .orElseThrow( () -> new ValidationException( String.format("Catalog %s does not exist", catalogName))); } private String getDDLOpExecuteErrorMsg(String action) { return String.format("Could not execute %s", action); } @Override public String getCurrentCatalog() { return catalogManager.getCurrentCatalog(); } @Override public void useCatalog(String catalogName) { catalogManager.setCurrentCatalog(catalogName); } @Override public String getCurrentDatabase() { return catalogManager.getCurrentDatabase(); } @Override public void useDatabase(String databaseName) { catalogManager.setCurrentDatabase(databaseName); } @Override public TableConfig getConfig() { return tableConfig; } @Override public JobExecutionResult execute(String jobName) throws Exception { Pipeline pipeline = execEnv.createPipeline(translateAndClearBuffer(), tableConfig, jobName); return execEnv.execute(pipeline); } @Override public Parser getParser() { return parser; } @Override public CatalogManager getCatalogManager() { return catalogManager; } /** * Subclasses can override this method to transform the given QueryOperation to a new one with * the qualified object identifier. This is needed for some QueryOperations, e.g. * JavaDataStreamQueryOperation, which doesn't know the registered identifier when created * ({@code fromDataStream(DataStream)}. But the identifier is required when converting this * QueryOperation to RelNode. */ protected QueryOperation qualifyQueryOperation( ObjectIdentifier identifier, QueryOperation queryOperation) { return queryOperation; } /** * Subclasses can override this method to add additional checks. * * @param tableSource tableSource to validate */ protected void validateTableSource(TableSource tableSource) { TableSourceValidation.validateTableSource(tableSource, tableSource.getTableSchema()); } /** * Translate the buffered operations to Transformations, and clear the buffer. * *

The buffer will be clear even if the `translate` fails. In most cases, the failure is not * retryable (e.g. type mismatch, can't generate physical plan). If the buffer is not clear * after failure, the following `translate` will also fail. */ protected List> translateAndClearBuffer() { List> transformations; try { transformations = translate(bufferedModifyOperations); } finally { bufferedModifyOperations.clear(); } return transformations; } private List> translate(List modifyOperations) { return planner.translate(modifyOperations); } private void buffer(List modifyOperations) { bufferedModifyOperations.addAll(modifyOperations); } @VisibleForTesting protected ExplainDetail[] getExplainDetails(boolean extended) { if (extended) { if (isStreamingMode) { return new ExplainDetail[] { ExplainDetail.ESTIMATED_COST, ExplainDetail.CHANGELOG_MODE }; } else { return new ExplainDetail[] {ExplainDetail.ESTIMATED_COST}; } } else { return new ExplainDetail[0]; } } @Override public void registerTableSourceInternal(String name, TableSource tableSource) { validateTableSource(tableSource); ObjectIdentifier objectIdentifier = catalogManager.qualifyIdentifier(UnresolvedIdentifier.of(name)); Optional table = getTemporaryTable(objectIdentifier); if (table.isPresent()) { if (table.get() instanceof ConnectorCatalogTable) { ConnectorCatalogTable sourceSinkTable = (ConnectorCatalogTable) table.get(); if (sourceSinkTable.getTableSource().isPresent()) { throw new ValidationException( String.format( "Table '%s' already exists. Please choose a different name.", name)); } else { // wrapper contains only sink (not source) ConnectorCatalogTable sourceAndSink = ConnectorCatalogTable.sourceAndSink( tableSource, sourceSinkTable.getTableSink().get(), !IS_STREAM_TABLE); catalogManager.dropTemporaryTable(objectIdentifier, false); catalogManager.createTemporaryTable(sourceAndSink, objectIdentifier, false); } } else { throw new ValidationException( String.format( "Table '%s' already exists. Please choose a different name.", name)); } } else { ConnectorCatalogTable source = ConnectorCatalogTable.source(tableSource, !IS_STREAM_TABLE); catalogManager.createTemporaryTable(source, objectIdentifier, false); } } @Override public void registerTableSinkInternal(String name, TableSink tableSink) { ObjectIdentifier objectIdentifier = catalogManager.qualifyIdentifier(UnresolvedIdentifier.of(name)); Optional table = getTemporaryTable(objectIdentifier); if (table.isPresent()) { if (table.get() instanceof ConnectorCatalogTable) { ConnectorCatalogTable sourceSinkTable = (ConnectorCatalogTable) table.get(); if (sourceSinkTable.getTableSink().isPresent()) { throw new ValidationException( String.format( "Table '%s' already exists. Please choose a different name.", name)); } else { // wrapper contains only sink (not source) ConnectorCatalogTable sourceAndSink = ConnectorCatalogTable.sourceAndSink( sourceSinkTable.getTableSource().get(), tableSink, !IS_STREAM_TABLE); catalogManager.dropTemporaryTable(objectIdentifier, false); catalogManager.createTemporaryTable(sourceAndSink, objectIdentifier, false); } } else { throw new ValidationException( String.format( "Table '%s' already exists. 
Please choose a different name.", name)); } } else { ConnectorCatalogTable sink = ConnectorCatalogTable.sink(tableSink, !IS_STREAM_TABLE); catalogManager.createTemporaryTable(sink, objectIdentifier, false); } } private Optional getTemporaryTable(ObjectIdentifier identifier) { return catalogManager .getTable(identifier) .filter(CatalogManager.TableLookupResult::isTemporary) .map(CatalogManager.TableLookupResult::getTable); } private TableResult createCatalogFunction( CreateCatalogFunctionOperation createCatalogFunctionOperation) { String exMsg = getDDLOpExecuteErrorMsg(createCatalogFunctionOperation.asSummaryString()); try { if (createCatalogFunctionOperation.isTemporary()) { functionCatalog.registerTemporaryCatalogFunction( UnresolvedIdentifier.of( createCatalogFunctionOperation.getFunctionIdentifier().toList()), createCatalogFunctionOperation.getCatalogFunction(), createCatalogFunctionOperation.isIgnoreIfExists()); } else { Catalog catalog = getCatalogOrThrowException( createCatalogFunctionOperation .getFunctionIdentifier() .getCatalogName()); catalog.createFunction( createCatalogFunctionOperation.getFunctionIdentifier().toObjectPath(), createCatalogFunctionOperation.getCatalogFunction(), createCatalogFunctionOperation.isIgnoreIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (FunctionAlreadyExistException e) { throw new ValidationException(e.getMessage(), e); } catch (Exception e) { throw new TableException(exMsg, e); } } private TableResult alterCatalogFunction( AlterCatalogFunctionOperation alterCatalogFunctionOperation) { String exMsg = getDDLOpExecuteErrorMsg(alterCatalogFunctionOperation.asSummaryString()); try { CatalogFunction function = alterCatalogFunctionOperation.getCatalogFunction(); if (alterCatalogFunctionOperation.isTemporary()) { throw new ValidationException("Alter temporary catalog function is not supported"); } else { Catalog catalog = getCatalogOrThrowException( alterCatalogFunctionOperation .getFunctionIdentifier() .getCatalogName()); catalog.alterFunction( alterCatalogFunctionOperation.getFunctionIdentifier().toObjectPath(), function, alterCatalogFunctionOperation.isIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (FunctionNotExistException e) { throw new ValidationException(e.getMessage(), e); } catch (Exception e) { throw new TableException(exMsg, e); } } private TableResult dropCatalogFunction( DropCatalogFunctionOperation dropCatalogFunctionOperation) { String exMsg = getDDLOpExecuteErrorMsg(dropCatalogFunctionOperation.asSummaryString()); try { if (dropCatalogFunctionOperation.isTemporary()) { functionCatalog.dropTempCatalogFunction( dropCatalogFunctionOperation.getFunctionIdentifier(), dropCatalogFunctionOperation.isIfExists()); } else { Catalog catalog = getCatalogOrThrowException( dropCatalogFunctionOperation .getFunctionIdentifier() .getCatalogName()); catalog.dropFunction( dropCatalogFunctionOperation.getFunctionIdentifier().toObjectPath(), dropCatalogFunctionOperation.isIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (FunctionNotExistException e) { throw new ValidationException(e.getMessage(), e); } catch (Exception e) { throw new TableException(exMsg, e); } } private TableResult createSystemFunction(CreateTempSystemFunctionOperation operation) { String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { functionCatalog.registerTemporarySystemFunction( 
operation.getFunctionName(), operation.getFunctionClass(), operation.getFunctionLanguage(), operation.isIgnoreIfExists()); return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (Exception e) { throw new TableException(exMsg, e); } } private TableResult dropSystemFunction(DropTempSystemFunctionOperation operation) { try { functionCatalog.dropTemporarySystemFunction( operation.getFunctionName(), operation.isIfExists()); return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (Exception e) { throw new TableException(getDDLOpExecuteErrorMsg(operation.asSummaryString()), e); } } protected TableImpl createTable(QueryOperation tableOperation) { return TableImpl.createTable( this, tableOperation, operationTreeBuilder, functionCatalog.asLookup(parser::parseIdentifier)); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.12/org/apache/flink/util/ExceptionUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // // The function "stringifyException" is based on source code from the Hadoop Project // (http://hadoop.apache.org/), // licensed by the Apache Software Foundation (ASF) under the Apache License, Version 2.0. // See the NOTICE file distributed with this work for additional information regarding copyright // ownership. // package org.apache.flink.util; import com.zto.fire.common.util.ExceptionBus; import org.apache.flink.annotation.Internal; import org.apache.flink.util.function.RunnableWithException; import javax.annotation.Nullable; import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; import java.lang.reflect.Field; import java.util.Optional; import java.util.concurrent.CompletionException; import java.util.concurrent.ExecutionException; import java.util.function.Function; import java.util.function.Predicate; import static org.apache.flink.util.Preconditions.checkNotNull; /** A collection of utility functions for dealing with exceptions and exception workflows. */ @Internal public final class ExceptionUtils { /** The stringified representation of a null exception reference. */ public static final String STRINGIFIED_NULL_EXCEPTION = "(null)"; // TODO: ------------ start:二次开发代码 --------------- // /** * Makes a string representation of the exception's stack trace, or "(null)", if the exception * is null. * *

This method makes a best effort and never fails. * * @param e The exception to stringify. * @return A string with exception name and call stack. */ public static String stringifyException(final Throwable e) { return stringifyException(e, ""); } /** * Makes a string representation of the exception's stack trace, or "(null)", if the exception * is null. * *

This method makes a best effort and never fails. * * @param e The exception to stringify. * @return A string with exception name and call stack. */ public static String stringifyException(final Throwable e, String sql) { if (e == null) { return STRINGIFIED_NULL_EXCEPTION; } try { StringWriter stm = new StringWriter(); PrintWriter wrt = new PrintWriter(stm); e.printStackTrace(wrt); wrt.close(); ExceptionBus.post(e, sql); return stm.toString(); } catch (Throwable t) { return e.getClass().getName() + " (error while printing stack trace)"; } } // TODO: ------------ end:二次开发代码 --------------- // /** * Checks whether the given exception indicates a situation that may leave the JVM in a * corrupted state, meaning a state where continued normal operation can only be guaranteed via * clean process restart. * *

Currently considered fatal exceptions are Virtual Machine errors indicating that the JVM * is corrupted, like {@link InternalError}, {@link UnknownError}, and {@link * java.util.zip.ZipError} (a special case of InternalError). The {@link ThreadDeath} exception * is also treated as a fatal error, because when a thread is forcefully stopped, there is a * high chance that parts of the system are in an inconsistent state. * * @param t The exception to check. * @return True, if the exception is considered fatal to the JVM, false otherwise. */ public static boolean isJvmFatalError(Throwable t) { return (t instanceof InternalError) || (t instanceof UnknownError) || (t instanceof ThreadDeath); } /** * Checks whether the given exception indicates a situation that may leave the JVM in a * corrupted state, or an out-of-memory error. * *

See {@link ExceptionUtils#isJvmFatalError(Throwable)} for a list of fatal JVM errors. This * method additionally classifies the {@link OutOfMemoryError} as fatal, because it may occur in * any thread (not the one that allocated the majority of the memory) and thus is often not * recoverable by destroying the particular thread that threw the exception. * * @param t The exception to check. * @return True, if the exception is fatal to the JVM or and OutOfMemoryError, false otherwise. */ public static boolean isJvmFatalOrOutOfMemoryError(Throwable t) { return isJvmFatalError(t) || t instanceof OutOfMemoryError; } /** * Tries to enrich OutOfMemoryErrors being part of the passed root Throwable's cause tree. * *

This method improves error messages for direct and metaspace {@link OutOfMemoryError}. It * adds description about the possible causes and ways of resolution. * * @param root The Throwable of which the cause tree shall be traversed. * @param jvmMetaspaceOomNewErrorMessage The message being used for JVM metaspace-related * OutOfMemoryErrors. Passing null will disable handling this class of error. * @param jvmDirectOomNewErrorMessage The message being used for direct memory-related * OutOfMemoryErrors. Passing null will disable handling this class of error. * @param jvmHeapSpaceOomNewErrorMessage The message being used for Heap space-related * OutOfMemoryErrors. Passing null will disable handling this class of error. */ public static void tryEnrichOutOfMemoryError( @Nullable Throwable root, @Nullable String jvmMetaspaceOomNewErrorMessage, @Nullable String jvmDirectOomNewErrorMessage, @Nullable String jvmHeapSpaceOomNewErrorMessage) { updateDetailMessage( root, t -> { if (isMetaspaceOutOfMemoryError(t)) { return jvmMetaspaceOomNewErrorMessage; } else if (isDirectOutOfMemoryError(t)) { return jvmDirectOomNewErrorMessage; } else if (isHeapSpaceOutOfMemoryError(t)) { return jvmHeapSpaceOomNewErrorMessage; } return null; }); } /** * Updates error messages of Throwables appearing in the cause tree of the passed root * Throwable. The passed Function is applied on each Throwable of the cause tree. Returning a * String will cause the detailMessage of the corresponding Throwable to be updated. Returning * null, instead, won't trigger any detailMessage update on that Throwable. * * @param root The Throwable whose cause tree shall be traversed. * @param throwableToMessage The Function based on which the new messages are generated. The * function implementation should return the new message. Returning null, in * contrast, will result in not updating the message for the corresponding Throwable. */ public static void updateDetailMessage( @Nullable Throwable root, @Nullable Function throwableToMessage) { if (throwableToMessage == null) { return; } Throwable it = root; while (it != null) { String newMessage = throwableToMessage.apply(it); if (newMessage != null) { updateDetailMessageOfThrowable(it, newMessage); } it = it.getCause(); } } private static void updateDetailMessageOfThrowable( Throwable throwable, String newDetailMessage) { Field field; try { field = Throwable.class.getDeclaredField("detailMessage"); } catch (NoSuchFieldException e) { throw new IllegalStateException( "The JDK Throwable contains a detailMessage member. The Throwable class provided on the classpath does not which is why this exception appears.", e); } field.setAccessible(true); try { field.set(throwable, newDetailMessage); } catch (IllegalAccessException e) { throw new IllegalStateException( "The JDK Throwable contains a private detailMessage member that should be accessible through reflection. This is not the case for the Throwable class provided on the classpath.", e); } } /** * Checks whether the given exception indicates a JVM metaspace out-of-memory error. * * @param t The exception to check. * @return True, if the exception is the metaspace {@link OutOfMemoryError}, false otherwise. */ public static boolean isMetaspaceOutOfMemoryError(@Nullable Throwable t) { return isOutOfMemoryErrorWithMessageStartingWith(t, "Metaspace"); } /** * Checks whether the given exception indicates a JVM direct out-of-memory error. * * @param t The exception to check. 
* @return True, if the exception is the direct {@link OutOfMemoryError}, false otherwise. */ public static boolean isDirectOutOfMemoryError(@Nullable Throwable t) { return isOutOfMemoryErrorWithMessageStartingWith(t, "Direct buffer memory"); } public static boolean isHeapSpaceOutOfMemoryError(@Nullable Throwable t) { return isOutOfMemoryErrorWithMessageStartingWith(t, "Java heap space"); } private static boolean isOutOfMemoryErrorWithMessageStartingWith( @Nullable Throwable t, String prefix) { // the exact matching of the class is checked to avoid matching any custom subclasses of // OutOfMemoryError // as we are interested in the original exceptions, generated by JVM. return isOutOfMemoryError(t) && t.getMessage() != null && t.getMessage().startsWith(prefix); } private static boolean isOutOfMemoryError(@Nullable Throwable t) { return t != null && t.getClass() == OutOfMemoryError.class; } /** * Rethrows the given {@code Throwable}, if it represents an error that is fatal to the JVM. See * {@link ExceptionUtils#isJvmFatalError(Throwable)} for a definition of fatal errors. * * @param t The Throwable to check and rethrow. */ public static void rethrowIfFatalError(Throwable t) { if (isJvmFatalError(t)) { throw (Error) t; } } /** * Rethrows the given {@code Throwable}, if it represents an error that is fatal to the JVM or * an out-of-memory error. See {@link ExceptionUtils#isJvmFatalError(Throwable)} for a * definition of fatal errors. * * @param t The Throwable to check and rethrow. */ public static void rethrowIfFatalErrorOrOOM(Throwable t) { if (isJvmFatalError(t) || t instanceof OutOfMemoryError) { throw (Error) t; } } /** * Adds a new exception as a {@link Throwable#addSuppressed(Throwable) suppressed exception} to * a prior exception, or returns the new exception, if no prior exception exists. * *

{@code
     * public void closeAllThings() throws Exception {
     *     Exception ex = null;
     *     try {
     *         component.shutdown();
     *     } catch (Exception e) {
     *         ex = firstOrSuppressed(e, ex);
     *     }
     *     try {
     *         anotherComponent.stop();
     *     } catch (Exception e) {
     *         ex = firstOrSuppressed(e, ex);
     *     }
     *     try {
     *         lastComponent.shutdown();
     *     } catch (Exception e) {
     *         ex = firstOrSuppressed(e, ex);
     *     }
     *
     *     if (ex != null) {
     *         throw ex;
     *     }
     * }
     * }
* * @param newException The newly occurred exception * @param previous The previously occurred exception, possibly null. * @return The new exception, if no previous exception exists, or the previous exception with * the new exception in the list of suppressed exceptions. */ public static T firstOrSuppressed(T newException, @Nullable T previous) { checkNotNull(newException, "newException"); if (previous == null) { return newException; } else { previous.addSuppressed(newException); return previous; } } /** * Throws the given {@code Throwable} in scenarios where the signatures do not allow you to * throw an arbitrary Throwable. Errors and RuntimeExceptions are thrown directly, other * exceptions are packed into runtime exceptions * * @param t The throwable to be thrown. */ public static void rethrow(Throwable t) { if (t instanceof Error) { throw (Error) t; } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else { throw new RuntimeException(t); } } /** * Throws the given {@code Throwable} in scenarios where the signatures do not allow you to * throw an arbitrary Throwable. Errors and RuntimeExceptions are thrown directly, other * exceptions are packed into a parent RuntimeException. * * @param t The throwable to be thrown. * @param parentMessage The message for the parent RuntimeException, if one is needed. */ public static void rethrow(Throwable t, String parentMessage) { if (t instanceof Error) { throw (Error) t; } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else { throw new RuntimeException(parentMessage, t); } } /** * Throws the given {@code Throwable} in scenarios where the signatures do allow to throw a * Exception. Errors and Exceptions are thrown directly, other "exotic" subclasses of Throwable * are wrapped in an Exception. * * @param t The throwable to be thrown. * @param parentMessage The message for the parent Exception, if one is needed. */ public static void rethrowException(Throwable t, String parentMessage) throws Exception { if (t instanceof Error) { throw (Error) t; } else if (t instanceof Exception) { throw (Exception) t; } else { throw new Exception(parentMessage, t); } } /** * Throws the given {@code Throwable} in scenarios where the signatures do allow to throw a * Exception. Errors and Exceptions are thrown directly, other "exotic" subclasses of Throwable * are wrapped in an Exception. * * @param t The throwable to be thrown. */ public static void rethrowException(Throwable t) throws Exception { if (t instanceof Error) { throw (Error) t; } else if (t instanceof Exception) { throw (Exception) t; } else { throw new Exception(t.getMessage(), t); } } /** * Tries to throw the given exception if not null. * * @param e exception to throw if not null. * @throws Exception */ public static void tryRethrowException(@Nullable Exception e) throws Exception { if (e != null) { throw e; } } /** * Tries to throw the given {@code Throwable} in scenarios where the signatures allows only * IOExceptions (and RuntimeException and Error). Throws this exception directly, if it is an * IOException, a RuntimeException, or an Error. Otherwise does nothing. * * @param t The Throwable to be thrown. 
*/ public static void tryRethrowIOException(Throwable t) throws IOException { if (t instanceof IOException) { throw (IOException) t; } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else if (t instanceof Error) { throw (Error) t; } } /** * Re-throws the given {@code Throwable} in scenarios where the signatures allows only * IOExceptions (and RuntimeException and Error). * *

Throws this exception directly, if it is an IOException, a RuntimeException, or an Error. * Otherwise it wraps it in an IOException and throws it. * * @param t The Throwable to be thrown. */ public static void rethrowIOException(Throwable t) throws IOException { if (t instanceof IOException) { throw (IOException) t; } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else if (t instanceof Error) { throw (Error) t; } else { throw new IOException(t.getMessage(), t); } } /** * Checks whether a throwable chain contains a specific type of exception and returns it. It * deserializes any {@link SerializedThrowable} that are found using the provided {@link * ClassLoader}. * * @param throwable the throwable chain to check. * @param searchType the type of exception to search for in the chain. * @param classLoader to use for deserialization. * @return Optional throwable of the requested type if available, otherwise empty */ public static Optional findSerializedThrowable( Throwable throwable, Class searchType, ClassLoader classLoader) { if (throwable == null || searchType == null) { return Optional.empty(); } Throwable t = throwable; while (t != null) { if (searchType.isAssignableFrom(t.getClass())) { return Optional.of(searchType.cast(t)); } else if (t.getClass().isAssignableFrom(SerializedThrowable.class)) { Throwable next = ((SerializedThrowable) t).deserializeError(classLoader); // SerializedThrowable#deserializeError returns itself under some conditions (e.g., // null cause). // If that happens, exit to avoid looping infinitely. This is ok because if the user // was searching // for a SerializedThrowable, we would have returned it in the initial if condition. t = (next == t) ? null : next; } else { t = t.getCause(); } } return Optional.empty(); } /** * Checks whether a throwable chain contains a specific type of exception and returns it. * * @param throwable the throwable chain to check. * @param searchType the type of exception to search for in the chain. * @return Optional throwable of the requested type if available, otherwise empty */ public static Optional findThrowable( Throwable throwable, Class searchType) { if (throwable == null || searchType == null) { return Optional.empty(); } // TODO: ------------ start:二次开发代码 --------------- // ExceptionBus.post(throwable, ""); // TODO: ------------ end:二次开发代码 --------------- // Throwable t = throwable; while (t != null) { if (searchType.isAssignableFrom(t.getClass())) { return Optional.of(searchType.cast(t)); } else { t = t.getCause(); } } return Optional.empty(); } /** * Checks whether a throwable chain contains a specific type of exception and returns it. This * method handles {@link SerializedThrowable}s in the chain and deserializes them with the given * ClassLoader. * *

SerializedThrowables are often used when exceptions might come from dynamically loaded * code and be transported over RPC / HTTP for better error reporting. The receiving processes * or threads might not have the dynamically loaded code available. * * @param throwable the throwable chain to check. * @param searchType the type of exception to search for in the chain. * @param classLoader the ClassLoader to use when encountering a SerializedThrowable. * @return Optional throwable of the requested type if available, otherwise empty */ public static Optional findThrowableSerializedAware( Throwable throwable, Class searchType, ClassLoader classLoader) { if (throwable == null || searchType == null) { return Optional.empty(); } Throwable t = throwable; while (t != null) { if (searchType.isAssignableFrom(t.getClass())) { return Optional.of(searchType.cast(t)); } else if (t instanceof SerializedThrowable) { t = ((SerializedThrowable) t).deserializeError(classLoader); } else { t = t.getCause(); } } return Optional.empty(); } /** * Checks whether a throwable chain contains an exception matching a predicate and returns it. * * @param throwable the throwable chain to check. * @param predicate the predicate of the exception to search for in the chain. * @return Optional throwable of the requested type if available, otherwise empty */ public static Optional findThrowable( Throwable throwable, Predicate predicate) { if (throwable == null || predicate == null) { return Optional.empty(); } Throwable t = throwable; while (t != null) { if (predicate.test(t)) { return Optional.of(t); } else { t = t.getCause(); } } return Optional.empty(); } /** * Checks whether a throwable chain contains a specific error message and returns the * corresponding throwable. * * @param throwable the throwable chain to check. * @param searchMessage the error message to search for in the chain. * @return Optional throwable containing the search message if available, otherwise empty */ public static Optional findThrowableWithMessage( Throwable throwable, String searchMessage) { if (throwable == null || searchMessage == null) { return Optional.empty(); } Throwable t = throwable; while (t != null) { if (t.getMessage() != null && t.getMessage().contains(searchMessage)) { return Optional.of(t); } else { t = t.getCause(); } } return Optional.empty(); } /** * Unpacks an {@link ExecutionException} and returns its cause. Otherwise the given Throwable is * returned. * * @param throwable to unpack if it is an ExecutionException * @return Cause of ExecutionException or given Throwable */ public static Throwable stripExecutionException(Throwable throwable) { return stripException(throwable, ExecutionException.class); } /** * Unpacks an {@link CompletionException} and returns its cause. Otherwise the given Throwable * is returned. * * @param throwable to unpack if it is an CompletionException * @return Cause of CompletionException or given Throwable */ public static Throwable stripCompletionException(Throwable throwable) { return stripException(throwable, CompletionException.class); } /** * Unpacks an specified exception and returns its cause. Otherwise the given {@link Throwable} * is returned. 
* * @param throwableToStrip to strip * @param typeToStrip type to strip * @return Unpacked cause or given Throwable if not packed */ public static Throwable stripException( Throwable throwableToStrip, Class typeToStrip) { while (typeToStrip.isAssignableFrom(throwableToStrip.getClass()) && throwableToStrip.getCause() != null) { throwableToStrip = throwableToStrip.getCause(); } return throwableToStrip; } /** * Tries to find a {@link SerializedThrowable} as the cause of the given throwable and throws * its deserialized value. If there is no such throwable, then the original throwable is thrown. * * @param throwable to check for a SerializedThrowable * @param classLoader to be used for the deserialization of the SerializedThrowable * @throws Throwable either the deserialized throwable or the given throwable */ public static void tryDeserializeAndThrow(Throwable throwable, ClassLoader classLoader) throws Throwable { Throwable current = throwable; while (!(current instanceof SerializedThrowable) && current.getCause() != null) { current = current.getCause(); } if (current instanceof SerializedThrowable) { throw ((SerializedThrowable) current).deserializeError(classLoader); } else { throw throwable; } } /** * Checks whether the given exception is a {@link InterruptedException} and sets the interrupted * flag accordingly. * * @param e to check whether it is an {@link InterruptedException} */ public static void checkInterrupted(Throwable e) { if (e instanceof InterruptedException) { Thread.currentThread().interrupt(); } } // ------------------------------------------------------------------------ // Lambda exception utilities // ------------------------------------------------------------------------ public static void suppressExceptions(RunnableWithException action) { try { action.run(); } catch (InterruptedException e) { // restore interrupted state Thread.currentThread().interrupt(); } catch (Throwable t) { if (isJvmFatalError(t)) { rethrow(t); } } } // ------------------------------------------------------------------------ /** Private constructor to prevent instantiation. */ private ExceptionUtils() {} } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.12/org/rocksdb/RocksDB.java ================================================ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). package org.rocksdb; import com.zto.fire.common.util.PropUtils; import org.rocksdb.util.Environment; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.*; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; /** * A RocksDB is a persistent ordered map from keys to values. It is safe for * concurrent access from multiple threads without any external synchronization. * All methods of this class could potentially throw RocksDBException, which * indicates sth wrong at the RocksDB library side and the call failed. 
*/ public class RocksDB extends RocksObject { public static final byte[] DEFAULT_COLUMN_FAMILY = "default".getBytes(); public static final int NOT_FOUND = -1; // TODO: ------------ start:二次开发代码 --------------- // // 当状态获取耗时超过该阈值时将记录日志,小于1表示不记录日志 protected long logThreshold = PropUtils.getLong("flink.state.log.threshold", 50, 1); // 当状态获取耗时超过该阈值时将记录日志的日志条数,小于1表示不限行数 protected long logThresholdMaxCount = PropUtils.getLong("flink.state.log.threshold.max_count", 300000, 1); protected AtomicLong currentLogCount = new AtomicLong(); protected static Logger logger = LoggerFactory.getLogger(RocksDB.class); /** * 用于计算状态获取的耗时 */ protected void elapsed(long start) { long elapsed = System.currentTimeMillis() - start; // 当且仅当以下两个条件满足时才会记录耗时日志: // 1. 当flink.state.log.threshold配置的阈值大于0时 // 2. 当flink.state.log.threshold.max_count配置的值小于1时或者当记录的行数小于配置的值时 if (this.logThreshold > 0 && (this.logThresholdMaxCount < 1 || this.currentLogCount.get() <= this.logThresholdMaxCount)) { if (elapsed >= this.logThreshold * 2) { logger.warn("RocksDB state get elapsed:{}ms.", elapsed); this.currentLogCount.incrementAndGet(); } else if (elapsed >= this.logThreshold) { logger.info("RocksDB state get elapsed:{}ms.", elapsed); this.currentLogCount.incrementAndGet(); } } } // TODO: ------------ end:二次开发代码 --------------- // private enum LibraryState { NOT_LOADED, LOADING, LOADED } private static AtomicReference libraryLoaded = new AtomicReference<>(LibraryState.NOT_LOADED); static { RocksDB.loadLibrary(); } /** * Loads the necessary library files. * Calling this method twice will have no effect. * By default the method extracts the shared library for loading at * java.io.tmpdir, however, you can override this temporary location by * setting the environment variable ROCKSDB_SHAREDLIB_DIR. */ public static void loadLibrary() { if (libraryLoaded.get() == LibraryState.LOADED) { return; } if (libraryLoaded.compareAndSet(LibraryState.NOT_LOADED, LibraryState.LOADING)) { final String tmpDir = System.getenv("ROCKSDB_SHAREDLIB_DIR"); // loading possibly necessary libraries. for (final CompressionType compressionType : CompressionType.values()) { try { if (compressionType.getLibraryName() != null) { System.loadLibrary(compressionType.getLibraryName()); } } catch (UnsatisfiedLinkError e) { // since it may be optional, we ignore its loading failure here. } } try { NativeLibraryLoader.getInstance().loadLibrary(tmpDir); } catch (IOException e) { libraryLoaded.set(LibraryState.NOT_LOADED); throw new RuntimeException("Unable to load the RocksDB shared library" + e); } libraryLoaded.set(LibraryState.LOADED); return; } while (libraryLoaded.get() == LibraryState.LOADING) { try { Thread.sleep(10); } catch(final InterruptedException e) { //ignore } } } /** * Tries to load the necessary library files from the given list of * directories. * * @param paths a list of strings where each describes a directory * of a library. */ public static void loadLibrary(final List paths) { if (libraryLoaded.get() == LibraryState.LOADED) { return; } if (libraryLoaded.compareAndSet(LibraryState.NOT_LOADED, LibraryState.LOADING)) { for (final CompressionType compressionType : CompressionType.values()) { if (compressionType.equals(CompressionType.NO_COMPRESSION)) { continue; } for (final String path : paths) { try { System.load(path + "/" + Environment.getSharedLibraryFileName( compressionType.getLibraryName())); break; } catch (UnsatisfiedLinkError e) { // since they are optional, we ignore loading fails. 
} } } boolean success = false; UnsatisfiedLinkError err = null; for (final String path : paths) { try { System.load(path + "/" + Environment.getJniLibraryFileName("rocksdbjni")); success = true; break; } catch (UnsatisfiedLinkError e) { err = e; } } if (!success) { libraryLoaded.set(LibraryState.NOT_LOADED); throw err; } libraryLoaded.set(LibraryState.LOADED); return; } while (libraryLoaded.get() == LibraryState.LOADING) { try { Thread.sleep(10); } catch(final InterruptedException e) { //ignore } } } /** * The factory constructor of RocksDB that opens a RocksDB instance given * the path to the database using the default options w/ createIfMissing * set to true. * * @param path the path to the rocksdb. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. * @see Options#setCreateIfMissing(boolean) */ public static RocksDB open(final String path) throws RocksDBException { // This allows to use the rocksjni default Options instead of // the c++ one. Options options = new Options(); options.setCreateIfMissing(true); return open(options, path); } /** * The factory constructor of RocksDB that opens a RocksDB instance given * the path to the database using the specified options and db path and a list * of column family names. *

     * If opened in read write mode every existing column family name must be
     * passed within the list to this method.
     *
     * If opened in read-only mode only a subset of existing column families must
     * be passed to this method.
     *
     * Options instance *should* not be disposed before all DBs using this options
     * instance have been closed. If user doesn't call options dispose explicitly,
     * then this options instance will be GC'd automatically.
     *
     * ColumnFamily handles are disposed when the RocksDB instance is disposed.
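     *
     * A minimal usage sketch (the path and the extra "state" column family below
     * are only illustrative):
     * <pre>{@code
     * List<ColumnFamilyDescriptor> descriptors = Arrays.asList(
     *     new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
     *     new ColumnFamilyDescriptor("state".getBytes()));
     * List<ColumnFamilyHandle> handles = new ArrayList<>();
     * RocksDB db = RocksDB.open("/tmp/rocksdb-demo", descriptors, handles);
     * try {
     *     db.put(handles.get(1), "key".getBytes(), "value".getBytes());
     * } finally {
     *     db.close();
     * }
     * }</pre>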

* * @param path the path to the rocksdb. * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. * @see DBOptions#setCreateIfMissing(boolean) */ public static RocksDB open(final String path, final List columnFamilyDescriptors, final List columnFamilyHandles) throws RocksDBException { // This allows to use the rocksjni default Options instead of // the c++ one. DBOptions options = new DBOptions(); return open(options, path, columnFamilyDescriptors, columnFamilyHandles); } /** * The factory constructor of RocksDB that opens a RocksDB instance given * the path to the database using the specified options and db path. * *

     * Options instance *should* not be disposed before all DBs using this options
     * instance have been closed. If user doesn't call options dispose explicitly,
     * then this options instance will be GC'd automatically.
     *
     * Options instance can be re-used to open multiple DBs if DB statistics is
     * not used. If DB statistics are required, then it's recommended to open DB
     * with a new Options instance as the underlying native statistics instance does not
     * use any locks to prevent concurrent updates.
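     *
     * A minimal usage sketch (the path below is only an example):
     * <pre>{@code
     * try (Options options = new Options().setCreateIfMissing(true);
     *      RocksDB db = RocksDB.open(options, "/tmp/rocksdb-demo")) {
     *     db.put("key".getBytes(), "value".getBytes());
     *     byte[] value = db.get("key".getBytes());
     * }
     * }</pre>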

* * @param options {@link org.rocksdb.Options} instance. * @param path the path to the rocksdb. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. * * @see Options#setCreateIfMissing(boolean) */ public static RocksDB open(final Options options, final String path) throws RocksDBException { // when non-default Options is used, keeping an Options reference // in RocksDB can prevent Java to GC during the life-time of // the currently-created RocksDB. final RocksDB db = new RocksDB(open(options.nativeHandle_, path)); db.storeOptionsInstance(options); return db; } /** * The factory constructor of RocksDB that opens a RocksDB instance given * the path to the database using the specified options and db path and a list * of column family names. *

     * If opened in read write mode every existing column family name must be
     * passed within the list to this method.
     *
     * If opened in read-only mode only a subset of existing column families must
     * be passed to this method.
     *
     * Options instance *should* not be disposed before all DBs using this options
     * instance have been closed. If user doesn't call options dispose explicitly,
     * then this options instance will be GC'd automatically.
     *
     * Options instance can be re-used to open multiple DBs if DB statistics is
     * not used. If DB statistics are required, then it's recommended to open DB
     * with a new Options instance as the underlying native statistics instance does not
     * use any locks to prevent concurrent updates.
     *
     * ColumnFamily handles are disposed when the RocksDB instance is disposed.
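     *
     * A usage sketch that lists the existing column families of a database and
     * re-opens them all (it assumes the database at the example path already exists):
     * <pre>{@code
     * String path = "/tmp/rocksdb-demo";
     * List<ColumnFamilyDescriptor> descriptors = new ArrayList<>();
     * try (Options listOptions = new Options()) {
     *     for (byte[] name : RocksDB.listColumnFamilies(listOptions, path)) {
     *         descriptors.add(new ColumnFamilyDescriptor(name));
     *     }
     * }
     * List<ColumnFamilyHandle> handles = new ArrayList<>();
     * try (DBOptions options = new DBOptions();
     *      RocksDB db = RocksDB.open(options, path, descriptors, handles)) {
     *     // read and write through the returned handles here
     * }
     * }</pre>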

* * @param options {@link org.rocksdb.DBOptions} instance. * @param path the path to the rocksdb. * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. * * @see DBOptions#setCreateIfMissing(boolean) */ public static RocksDB open(final DBOptions options, final String path, final List columnFamilyDescriptors, final List columnFamilyHandles) throws RocksDBException { final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; for (int i = 0; i < columnFamilyDescriptors.size(); i++) { final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors .get(i); cfNames[i] = cfDescriptor.columnFamilyName(); cfOptionHandles[i] = cfDescriptor.columnFamilyOptions().nativeHandle_; } final long[] handles = open(options.nativeHandle_, path, cfNames, cfOptionHandles); final RocksDB db = new RocksDB(handles[0]); db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { columnFamilyHandles.add(new ColumnFamilyHandle(db, handles[i])); } return db; } /** * The factory constructor of RocksDB that opens a RocksDB instance in * Read-Only mode given the path to the database using the default * options. * * @param path the path to the RocksDB. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public static RocksDB openReadOnly(final String path) throws RocksDBException { // This allows to use the rocksjni default Options instead of // the c++ one. Options options = new Options(); return openReadOnly(options, path); } /** * The factory constructor of RocksDB that opens a RocksDB instance in * Read-Only mode given the path to the database using the default * options. * * @param path the path to the RocksDB. * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public static RocksDB openReadOnly(final String path, final List columnFamilyDescriptors, final List columnFamilyHandles) throws RocksDBException { // This allows to use the rocksjni default Options instead of // the c++ one. final DBOptions options = new DBOptions(); return openReadOnly(options, path, columnFamilyDescriptors, columnFamilyHandles); } /** * The factory constructor of RocksDB that opens a RocksDB instance in * Read-Only mode given the path to the database using the specified * options and db path. * * Options instance *should* not be disposed before all DBs using this options * instance have been closed. If user doesn't call options dispose explicitly, * then this options instance will be GC'd automatically. * * @param options {@link Options} instance. * @param path the path to the RocksDB. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. 
*/ public static RocksDB openReadOnly(final Options options, final String path) throws RocksDBException { // when non-default Options is used, keeping an Options reference // in RocksDB can prevent Java to GC during the life-time of // the currently-created RocksDB. final RocksDB db = new RocksDB(openROnly(options.nativeHandle_, path)); db.storeOptionsInstance(options); return db; } /** * The factory constructor of RocksDB that opens a RocksDB instance in * Read-Only mode given the path to the database using the specified * options and db path. * *

     * This open method allows opening RocksDB using a subset of the available
     * column families.
     *
     * Options instance *should* not be disposed before all DBs using this
     * options instance have been closed. If user doesn't call options dispose
     * explicitly, then this options instance will be GC'd automatically.
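     *
     * A minimal read-only sketch that opens just the default column family
     * (the path is only an example):
     * <pre>{@code
     * List<ColumnFamilyDescriptor> descriptors = Collections.singletonList(
     *     new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
     * List<ColumnFamilyHandle> handles = new ArrayList<>();
     * try (DBOptions options = new DBOptions();
     *      RocksDB db = RocksDB.openReadOnly(options, "/tmp/rocksdb-demo",
     *          descriptors, handles)) {
     *     byte[] value = db.get(handles.get(0), "key".getBytes());
     * }
     * }</pre>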

* * @param options {@link DBOptions} instance. * @param path the path to the RocksDB. * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public static RocksDB openReadOnly(final DBOptions options, final String path, final List columnFamilyDescriptors, final List columnFamilyHandles) throws RocksDBException { // when non-default Options is used, keeping an Options reference // in RocksDB can prevent Java to GC during the life-time of // the currently-created RocksDB. final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; for (int i = 0; i < columnFamilyDescriptors.size(); i++) { final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors .get(i); cfNames[i] = cfDescriptor.columnFamilyName(); cfOptionHandles[i] = cfDescriptor.columnFamilyOptions().nativeHandle_; } final long[] handles = openROnly(options.nativeHandle_, path, cfNames, cfOptionHandles); final RocksDB db = new RocksDB(handles[0]); db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { columnFamilyHandles.add(new ColumnFamilyHandle(db, handles[i])); } return db; } /** * Static method to determine all available column families for a * rocksdb database identified by path * * @param options Options for opening the database * @param path Absolute path to rocksdb database * @return List<byte[]> List containing the column family names * * @throws RocksDBException thrown if error happens in underlying * native library. */ public static List listColumnFamilies(final Options options, final String path) throws RocksDBException { return Arrays.asList(RocksDB.listColumnFamilies(options.nativeHandle_, path)); } protected void storeOptionsInstance(DBOptionsInterface options) { options_ = options; } /** * Set the database entry for "key" to "value". * * @param key the specified key to be inserted. * @param value the value associated with the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void put(final byte[] key, final byte[] value) throws RocksDBException { put(nativeHandle_, key, 0, key.length, value, 0, value.length); } /** * Set the database entry for "key" to "value" in the specified * column family. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key the specified key to be inserted. * @param value the value associated with the specified key. * * throws IllegalArgumentException if column family is not present * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value) throws RocksDBException { put(nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_); } /** * Set the database entry for "key" to "value". * * @param writeOpts {@link org.rocksdb.WriteOptions} instance. * @param key the specified key to be inserted. * @param value the value associated with the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. 
*/ public void put(final WriteOptions writeOpts, final byte[] key, final byte[] value) throws RocksDBException { put(nativeHandle_, writeOpts.nativeHandle_, key, 0, key.length, value, 0, value.length); } /** * Set the database entry for "key" to "value" for the specified * column family. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param writeOpts {@link org.rocksdb.WriteOptions} instance. * @param key the specified key to be inserted. * @param value the value associated with the specified key. * * throws IllegalArgumentException if column family is not present * * @throws RocksDBException thrown if error happens in underlying * native library. * @see IllegalArgumentException */ public void put(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpts, final byte[] key, final byte[] value) throws RocksDBException { put(nativeHandle_, writeOpts.nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_); } /** * If the key definitely does not exist in the database, then this method * returns false, else true. * * This check is potentially lighter-weight than invoking DB::Get(). One way * to make this lighter weight is to avoid doing any IOs. * * @param key byte array of a key to search for * @param value StringBuilder instance which is a out parameter if a value is * found in block-cache. * @return boolean value indicating if key does not exist or might exist. */ public boolean keyMayExist(final byte[] key, final StringBuilder value) { return keyMayExist(nativeHandle_, key, 0, key.length, value); } /** * If the key definitely does not exist in the database, then this method * returns false, else true. * * This check is potentially lighter-weight than invoking DB::Get(). One way * to make this lighter weight is to avoid doing any IOs. * * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param key byte array of a key to search for * @param value StringBuilder instance which is a out parameter if a value is * found in block-cache. * @return boolean value indicating if key does not exist or might exist. */ public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final StringBuilder value) { return keyMayExist(nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_, value); } /** * If the key definitely does not exist in the database, then this method * returns false, else true. * * This check is potentially lighter-weight than invoking DB::Get(). One way * to make this lighter weight is to avoid doing any IOs. * * @param readOptions {@link ReadOptions} instance * @param key byte array of a key to search for * @param value StringBuilder instance which is a out parameter if a value is * found in block-cache. * @return boolean value indicating if key does not exist or might exist. */ public boolean keyMayExist(final ReadOptions readOptions, final byte[] key, final StringBuilder value) { return keyMayExist(nativeHandle_, readOptions.nativeHandle_, key, 0, key.length, value); } /** * If the key definitely does not exist in the database, then this method * returns false, else true. * * This check is potentially lighter-weight than invoking DB::Get(). One way * to make this lighter weight is to avoid doing any IOs. 
* * @param readOptions {@link ReadOptions} instance * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param key byte array of a key to search for * @param value StringBuilder instance which is a out parameter if a value is * found in block-cache. * @return boolean value indicating if key does not exist or might exist. */ public boolean keyMayExist(final ReadOptions readOptions, final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final StringBuilder value) { return keyMayExist(nativeHandle_, readOptions.nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_, value); } /** * Apply the specified updates to the database. * * @param writeOpts WriteOptions instance * @param updates WriteBatch instance * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void write(final WriteOptions writeOpts, final WriteBatch updates) throws RocksDBException { write0(nativeHandle_, writeOpts.nativeHandle_, updates.nativeHandle_); } /** * Apply the specified updates to the database. * * @param writeOpts WriteOptions instance * @param updates WriteBatchWithIndex instance * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void write(final WriteOptions writeOpts, final WriteBatchWithIndex updates) throws RocksDBException { write1(nativeHandle_, writeOpts.nativeHandle_, updates.nativeHandle_); } /** * Add merge operand for key/value pair. * * @param key the specified key to be merged. * @param value the value to be merged with the current value for * the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void merge(final byte[] key, final byte[] value) throws RocksDBException { merge(nativeHandle_, key, 0, key.length, value, 0, value.length); } /** * Add merge operand for key/value pair in a ColumnFamily. * * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param key the specified key to be merged. * @param value the value to be merged with the current value for * the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void merge(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value) throws RocksDBException { merge(nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_); } /** * Add merge operand for key/value pair. * * @param writeOpts {@link WriteOptions} for this write. * @param key the specified key to be merged. * @param value the value to be merged with the current value for * the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void merge(final WriteOptions writeOpts, final byte[] key, final byte[] value) throws RocksDBException { merge(nativeHandle_, writeOpts.nativeHandle_, key, 0, key.length, value, 0, value.length); } /** * Add merge operand for key/value pair. * * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param writeOpts {@link WriteOptions} for this write. * @param key the specified key to be merged. * @param value the value to be merged with the current value for * the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. 
*/ public void merge(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpts, final byte[] key, final byte[] value) throws RocksDBException { merge(nativeHandle_, writeOpts.nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_); } // TODO(AR) we should improve the #get() API, returning -1 (RocksDB.NOT_FOUND) is not very nice // when we could communicate better status into, also the C++ code show that -2 could be returned /** * Get the value associated with the specified key within column family* * @param key the key to retrieve the value. * @param value the out-value to receive the retrieved value. * @return The size of the actual value that matches the specified * {@code key} in byte. If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public int get(final byte[] key, final byte[] value) throws RocksDBException { return get(nativeHandle_, key, 0, key.length, value, 0, value.length); } /** * Get the value associated with the specified key within column family. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key the key to retrieve the value. * @param value the out-value to receive the retrieved value. * @return The size of the actual value that matches the specified * {@code key} in byte. If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public int get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value) throws RocksDBException, IllegalArgumentException { return get(nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_); } /** * Get the value associated with the specified key. * * @param opt {@link org.rocksdb.ReadOptions} instance. * @param key the key to retrieve the value. * @param value the out-value to receive the retrieved value. * @return The size of the actual value that matches the specified * {@code key} in byte. If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public int get(final ReadOptions opt, final byte[] key, final byte[] value) throws RocksDBException { return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length, value, 0, value.length); } /** * Get the value associated with the specified key within column family. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param opt {@link org.rocksdb.ReadOptions} instance. * @param key the key to retrieve the value. * @param value the out-value to receive the retrieved value. * @return The size of the actual value that matches the specified * {@code key} in byte. 
If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public int get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions opt, final byte[] key, final byte[] value) throws RocksDBException { return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_); } /** * The simplified version of get which returns a new byte array storing * the value associated with the specified input key if any. null will be * returned if the specified key is not found. * * @param key the key retrieve the value. * @return a byte array storing the value associated with the input key if * any. null if it does not find the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public byte[] get(final byte[] key) throws RocksDBException { return get(nativeHandle_, key, 0, key.length); } /** * The simplified version of get which returns a new byte array storing * the value associated with the specified input key if any. null will be * returned if the specified key is not found. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key the key retrieve the value. * @return a byte array storing the value associated with the input key if * any. null if it does not find the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public byte[] get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) throws RocksDBException { // TODO: ------------ start:二次开发代码 --------------- // long start = System.currentTimeMillis(); byte[] state = get(nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_); this.elapsed(start); // TODO: ------------ end:二次开发代码 --------------- // return state; } /** * The simplified version of get which returns a new byte array storing * the value associated with the specified input key if any. null will be * returned if the specified key is not found. * * @param key the key retrieve the value. * @param opt Read options. * @return a byte array storing the value associated with the input key if * any. null if it does not find the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public byte[] get(final ReadOptions opt, final byte[] key) throws RocksDBException { return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length); } /** * The simplified version of get which returns a new byte array storing * the value associated with the specified input key if any. null will be * returned if the specified key is not found. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key the key retrieve the value. * @param opt Read options. * @return a byte array storing the value associated with the input key if * any. null if it does not find the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. 
*/ public byte[] get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions opt, final byte[] key) throws RocksDBException { return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_); } /** * Returns a map of keys for which values were found in DB. * * @param keys List of keys for which values need to be retrieved. * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public Map multiGet(final List keys) throws RocksDBException { assert(keys.size() != 0); final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); final int keyOffsets[] = new int[keysArray.length]; final int keyLengths[] = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } final byte[][] values = multiGet(nativeHandle_, keysArray, keyOffsets, keyLengths); final Map keyValueMap = new HashMap<>(computeCapacityHint(values.length)); for(int i = 0; i < values.length; i++) { if(values[i] == null) { continue; } keyValueMap.put(keys.get(i), values[i]); } return keyValueMap; } private static int computeCapacityHint(final int estimatedNumberOfItems) { // Default load factor for HashMap is 0.75, so N * 1.5 will be at the load // limit. We add +1 for a buffer. return (int)Math.ceil(estimatedNumberOfItems * 1.5 + 1.0); } /** * Returns a map of keys for which values were found in DB. *

     * Note: Every key needs to have a related column family name in
     * {@code columnFamilyHandleList}.
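     *
     * A minimal sketch: the i-th key is looked up through the i-th handle, so both
     * lists must have the same size (db and handles are assumed to come from open()):
     * <pre>{@code
     * List<byte[]> keys = Arrays.asList("k1".getBytes(), "k2".getBytes());
     * List<ColumnFamilyHandle> cfs = Arrays.asList(handles.get(0), handles.get(1));
     * Map<byte[], byte[]> found = db.multiGet(cfs, keys);
     * }</pre>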

* * @param columnFamilyHandleList {@link java.util.List} containing * {@link org.rocksdb.ColumnFamilyHandle} instances. * @param keys List of keys for which values need to be retrieved. * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * * @throws RocksDBException thrown if error happens in underlying * native library. * @throws IllegalArgumentException thrown if the size of passed keys is not * equal to the amount of passed column family handles. */ public Map multiGet( final List columnFamilyHandleList, final List keys) throws RocksDBException, IllegalArgumentException { assert(keys.size() != 0); // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. if (keys.size() != columnFamilyHandleList.size()) { throw new IllegalArgumentException( "For each key there must be a ColumnFamilyHandle."); } final long[] cfHandles = new long[columnFamilyHandleList.size()]; for (int i = 0; i < columnFamilyHandleList.size(); i++) { cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; } final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); final int keyOffsets[] = new int[keysArray.length]; final int keyLengths[] = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } final byte[][] values = multiGet(nativeHandle_, keysArray, keyOffsets, keyLengths, cfHandles); final Map keyValueMap = new HashMap<>(computeCapacityHint(values.length)); for(int i = 0; i < values.length; i++) { if (values[i] == null) { continue; } keyValueMap.put(keys.get(i), values[i]); } return keyValueMap; } /** * Returns a map of keys for which values were found in DB. * * @param opt Read options. * @param keys of keys for which values need to be retrieved. * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public Map multiGet(final ReadOptions opt, final List keys) throws RocksDBException { assert(keys.size() != 0); final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); final int keyOffsets[] = new int[keysArray.length]; final int keyLengths[] = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } final byte[][] values = multiGet(nativeHandle_, opt.nativeHandle_, keysArray, keyOffsets, keyLengths); final Map keyValueMap = new HashMap<>(computeCapacityHint(values.length)); for(int i = 0; i < values.length; i++) { if(values[i] == null) { continue; } keyValueMap.put(keys.get(i), values[i]); } return keyValueMap; } /** * Returns a map of keys for which values were found in DB. *

     * Note: Every key needs to have a related column family name in
     * {@code columnFamilyHandleList}.

* * @param opt Read options. * @param columnFamilyHandleList {@link java.util.List} containing * {@link org.rocksdb.ColumnFamilyHandle} instances. * @param keys of keys for which values need to be retrieved. * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * * @throws RocksDBException thrown if error happens in underlying * native library. * @throws IllegalArgumentException thrown if the size of passed keys is not * equal to the amount of passed column family handles. */ public Map multiGet(final ReadOptions opt, final List columnFamilyHandleList, final List keys) throws RocksDBException { assert(keys.size() != 0); // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. if (keys.size()!=columnFamilyHandleList.size()){ throw new IllegalArgumentException( "For each key there must be a ColumnFamilyHandle."); } final long[] cfHandles = new long[columnFamilyHandleList.size()]; for (int i = 0; i < columnFamilyHandleList.size(); i++) { cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; } final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); final int keyOffsets[] = new int[keysArray.length]; final int keyLengths[] = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } final byte[][] values = multiGet(nativeHandle_, opt.nativeHandle_, keysArray, keyOffsets, keyLengths, cfHandles); final Map keyValueMap = new HashMap<>(computeCapacityHint(values.length)); for(int i = 0; i < values.length; i++) { if(values[i] == null) { continue; } keyValueMap.put(keys.get(i), values[i]); } return keyValueMap; } /** * Remove the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. * * @deprecated Use {@link #delete(byte[])} */ @Deprecated public void remove(final byte[] key) throws RocksDBException { delete(key); } /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void delete(final byte[] key) throws RocksDBException { delete(nativeHandle_, key, 0, key.length); } /** * Remove the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. * * @deprecated Use {@link #delete(ColumnFamilyHandle, byte[])} */ @Deprecated public void remove(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) throws RocksDBException { delete(columnFamilyHandle, key); } /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. 
*/ public void delete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) throws RocksDBException { delete(nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_); } /** * Remove the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param writeOpt WriteOptions to be used with delete operation * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. * * @deprecated Use {@link #delete(WriteOptions, byte[])} */ @Deprecated public void remove(final WriteOptions writeOpt, final byte[] key) throws RocksDBException { delete(writeOpt, key); } /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param writeOpt WriteOptions to be used with delete operation * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void delete(final WriteOptions writeOpt, final byte[] key) throws RocksDBException { delete(nativeHandle_, writeOpt.nativeHandle_, key, 0, key.length); } /** * Remove the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param writeOpt WriteOptions to be used with delete operation * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. * * @deprecated Use {@link #delete(ColumnFamilyHandle, WriteOptions, byte[])} */ @Deprecated public void remove(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpt, final byte[] key) throws RocksDBException { delete(columnFamilyHandle, writeOpt, key); } /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param writeOpt WriteOptions to be used with delete operation * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void delete(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpt, final byte[] key) throws RocksDBException { delete(nativeHandle_, writeOpt.nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_); } /** * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist * in the database. * * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple * times), then the result of calling SingleDelete() on this key is undefined. * SingleDelete() only behaves correctly if there has been only one Put() * for this key since the previous call to SingleDelete() for this key. * * This feature is currently an experimental performance optimization * for a very specific workload. It is up to the caller to ensure that * SingleDelete is only used for a key that is not deleted using Delete() or * written using Merge(). Mixing SingleDelete operations with Deletes and * Merges can result in undefined behavior. 
* * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ @Experimental("Performance optimization for a very specific workload") public void singleDelete(final byte[] key) throws RocksDBException { singleDelete(nativeHandle_, key, key.length); } /** * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist * in the database. * * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple * times), then the result of calling SingleDelete() on this key is undefined. * SingleDelete() only behaves correctly if there has been only one Put() * for this key since the previous call to SingleDelete() for this key. * * This feature is currently an experimental performance optimization * for a very specific workload. It is up to the caller to ensure that * SingleDelete is only used for a key that is not deleted using Delete() or * written using Merge(). Mixing SingleDelete operations with Deletes and * Merges can result in undefined behavior. * * @param columnFamilyHandle The column family to delete the key from * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ @Experimental("Performance optimization for a very specific workload") public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) throws RocksDBException { singleDelete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } /** * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist * in the database. * * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple * times), then the result of calling SingleDelete() on this key is undefined. * SingleDelete() only behaves correctly if there has been only one Put() * for this key since the previous call to SingleDelete() for this key. * * This feature is currently an experimental performance optimization * for a very specific workload. It is up to the caller to ensure that * SingleDelete is only used for a key that is not deleted using Delete() or * written using Merge(). Mixing SingleDelete operations with Deletes and * Merges can result in undefined behavior. * * Note: consider setting {@link WriteOptions#setSync(boolean)} true. * * @param writeOpt Write options for the delete * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ @Experimental("Performance optimization for a very specific workload") public void singleDelete(final WriteOptions writeOpt, final byte[] key) throws RocksDBException { singleDelete(nativeHandle_, writeOpt.nativeHandle_, key, key.length); } /** * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist * in the database. * * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple * times), then the result of calling SingleDelete() on this key is undefined. * SingleDelete() only behaves correctly if there has been only one Put() * for this key since the previous call to SingleDelete() for this key. * * This feature is currently an experimental performance optimization * for a very specific workload. 
It is up to the caller to ensure that * SingleDelete is only used for a key that is not deleted using Delete() or * written using Merge(). Mixing SingleDelete operations with Deletes and * Merges can result in undefined behavior. * * Note: consider setting {@link WriteOptions#setSync(boolean)} true. * * @param columnFamilyHandle The column family to delete the key from * @param writeOpt Write options for the delete * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ @Experimental("Performance optimization for a very specific workload") public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpt, final byte[] key) throws RocksDBException { singleDelete(nativeHandle_, writeOpt.nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } /** * DB implements can export properties about their state * via this method on a per column family level. * *

 * If {@code property} is a valid property understood by this DB
 * implementation, fills {@code value} with its current value and
 * returns true. Otherwise returns false.
 *
 * Valid property names include:
 *   - "rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
 *     where <N> is an ASCII representation of a level number (e.g. "0").
 *   - "rocksdb.stats" - returns a multi-line string that describes statistics
 *     about the internal operation of the DB.
 *   - "rocksdb.sstables" - returns a multi-line string that describes all
 *     of the sstables that make up the db contents.
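 *
 * Illustrative usage sketch (not from the upstream docs; assumes {@code db} is an open
 * {@code RocksDB} instance and {@code cfHandle} a valid column family handle):
 * <pre>{@code
 *   // fetch the human-readable statistics block for a single column family
 *   String stats = db.getProperty(cfHandle, "rocksdb.stats");
 *   System.out.println(stats);
 * }</pre>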
* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param property to be fetched. See above for examples * @return property value * * @throws RocksDBException thrown if error happens in underlying * native library. */ public String getProperty(final ColumnFamilyHandle columnFamilyHandle, final String property) throws RocksDBException { return getProperty0(nativeHandle_, columnFamilyHandle.nativeHandle_, property, property.length()); } /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., * including "beginKey" and excluding "endKey". a non-OK status on error. It * is not an error if no keys exist in the range ["beginKey", "endKey"). * * Delete the database entry (if any) for "key". Returns OK on success, and a * non-OK status on error. It is not an error if "key" did not exist in the * database. * * @param beginKey * First key to delete within database (included) * @param endKey * Last key to delete within database (excluded) * * @throws RocksDBException * thrown if error happens in underlying native library. */ public void deleteRange(final byte[] beginKey, final byte[] endKey) throws RocksDBException { deleteRange(nativeHandle_, beginKey, 0, beginKey.length, endKey, 0, endKey.length); } /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., * including "beginKey" and excluding "endKey". a non-OK status on error. It * is not an error if no keys exist in the range ["beginKey", "endKey"). * * Delete the database entry (if any) for "key". Returns OK on success, and a * non-OK status on error. It is not an error if "key" did not exist in the * database. * * @param columnFamilyHandle * {@link org.rocksdb.ColumnFamilyHandle} instance * @param beginKey * First key to delete within database (included) * @param endKey * Last key to delete within database (excluded) * * @throws RocksDBException * thrown if error happens in underlying native library. */ public void deleteRange(final ColumnFamilyHandle columnFamilyHandle, final byte[] beginKey, final byte[] endKey) throws RocksDBException { deleteRange(nativeHandle_, beginKey, 0, beginKey.length, endKey, 0, endKey.length, columnFamilyHandle.nativeHandle_); } /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., * including "beginKey" and excluding "endKey". a non-OK status on error. It * is not an error if no keys exist in the range ["beginKey", "endKey"). * * Delete the database entry (if any) for "key". Returns OK on success, and a * non-OK status on error. It is not an error if "key" did not exist in the * database. * * @param writeOpt * WriteOptions to be used with delete operation * @param beginKey * First key to delete within database (included) * @param endKey * Last key to delete within database (excluded) * * @throws RocksDBException * thrown if error happens in underlying native library. */ public void deleteRange(final WriteOptions writeOpt, final byte[] beginKey, final byte[] endKey) throws RocksDBException { deleteRange(nativeHandle_, writeOpt.nativeHandle_, beginKey, 0, beginKey.length, endKey, 0, endKey.length); } /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., * including "beginKey" and excluding "endKey". a non-OK status on error. It * is not an error if no keys exist in the range ["beginKey", "endKey"). * * Delete the database entry (if any) for "key". Returns OK on success, and a * non-OK status on error. It is not an error if "key" did not exist in the * database. 
* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param writeOpt * WriteOptions to be used with delete operation * @param beginKey * First key to delete within database (included) * @param endKey * Last key to delete within database (excluded) * * @throws RocksDBException * thrown if error happens in underlying native library. */ public void deleteRange(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpt, final byte[] beginKey, final byte[] endKey) throws RocksDBException { deleteRange(nativeHandle_, writeOpt.nativeHandle_, beginKey, 0, beginKey.length, endKey, 0, endKey.length, columnFamilyHandle.nativeHandle_); } /** * DB implementations can export properties about their state * via this method. If "property" is a valid property understood by this * DB implementation, fills "*value" with its current value and returns * true. Otherwise returns false. * *

 * Valid property names include:
 *   - "rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
 *     where <N> is an ASCII representation of a level number (e.g. "0").
 *   - "rocksdb.stats" - returns a multi-line string that describes statistics
 *     about the internal operation of the DB.
 *   - "rocksdb.sstables" - returns a multi-line string that describes all
 *     of the sstables that make up the db contents.
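 *
 * Illustrative usage sketch (assumes {@code db} is an open {@code RocksDB} instance):
 * <pre>{@code
 *   // number of SST files currently at level 0 of the default column family
 *   String filesAtLevelZero = db.getProperty("rocksdb.num-files-at-level0");
 * }</pre>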
* * @param property to be fetched. See above for examples * @return property value * * @throws RocksDBException thrown if error happens in underlying * native library. */ public String getProperty(final String property) throws RocksDBException { return getProperty0(nativeHandle_, property, property.length()); } /** *

 * Similar to GetProperty(), but only works for a subset of properties whose
 * return value is a numerical value. Returns the value as a long.
 *
 * Note: As the returned property is of type {@code uint64_t} on the C++ side,
 * the returned value can be negative, because Java 7 only supports signed long values.
 *
 * Java 7: To mitigate the problem of the non-existent unsigned long type, values
 * should be encapsulated using {@link java.math.BigInteger} to reflect the correct value.
 * The correct behavior is guaranteed if {@code 2^64} is added to negative values.
 *
 * Java 8: In Java 8 the value should be treated as unsigned long using the
 * provided methods of type {@link Long}.
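 *
 * Illustrative usage sketch (assumes {@code db} is an open {@code RocksDB} instance and that
 * the property name is supported by the bundled RocksDB version):
 * <pre>{@code
 *   long memtableBytes = db.getLongProperty("rocksdb.cur-size-all-mem-tables");
 *   // print as unsigned, as recommended above for Java 8
 *   System.out.println(Long.toUnsignedString(memtableBytes));
 * }</pre>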

* * @param property to be fetched. * * @return numerical property value. * * @throws RocksDBException if an error happens in the underlying native code. */ public long getLongProperty(final String property) throws RocksDBException { return getLongProperty(nativeHandle_, property, property.length()); } /** *

 * Similar to GetProperty(), but only works for a subset of properties whose
 * return value is a numerical value. Returns the value as a long.
 *
 * Note: As the returned property is of type {@code uint64_t} on the C++ side,
 * the returned value can be negative, because Java 7 only supports signed long values.
 *
 * Java 7: To mitigate the problem of the non-existent unsigned long type, values
 * should be encapsulated using {@link java.math.BigInteger} to reflect the correct value.
 * The correct behavior is guaranteed if {@code 2^64} is added to negative values.
 *
 * Java 8: In Java 8 the value should be treated as unsigned long using the
 * provided methods of type {@link Long}.

* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param property to be fetched. * * @return numerical property value * * @throws RocksDBException if an error happens in the underlying native code. */ public long getLongProperty(final ColumnFamilyHandle columnFamilyHandle, final String property) throws RocksDBException { return getLongProperty(nativeHandle_, columnFamilyHandle.nativeHandle_, property, property.length()); } /** *

 * Return the sum of getLongProperty() over all the column families.
 *
 * Note: As the returned property is of type {@code uint64_t} on the C++ side,
 * the returned value can be negative, because Java 7 only supports signed long values.
 *
 * Java 7: To mitigate the problem of the non-existent unsigned long type, values
 * should be encapsulated using {@link java.math.BigInteger} to reflect the correct value.
 * The correct behavior is guaranteed if {@code 2^64} is added to negative values.
 *
 * Java 8: In Java 8 the value should be treated as unsigned long using the
 * provided methods of type {@link Long}.

* * @param property to be fetched. * * @return numerical property value * * @throws RocksDBException if an error happens in the underlying native code. */ public long getAggregatedLongProperty(final String property) throws RocksDBException { return getAggregatedLongProperty(nativeHandle_, property, property.length()); } /** *

 * Return a heap-allocated iterator over the contents of the database. The result
 * of newIterator() is initially invalid (the caller must call one of the Seek
 * methods on the iterator before using it).
 *
 * The caller should close the iterator when it is no longer needed. The returned
 * iterator should be closed before this db is closed.
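 *
 * Illustrative usage sketch (assumes {@code db} is an open {@code RocksDB} instance):
 * <pre>{@code
 *   try (RocksIterator it = db.newIterator()) {
 *     // the iterator is invalid until one of the Seek methods is called
 *     for (it.seekToFirst(); it.isValid(); it.next()) {
 *       byte[] key = it.key();
 *       byte[] value = it.value();
 *     }
 *   }
 * }</pre>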

* * @return instance of iterator object. */ public RocksIterator newIterator() { return new RocksIterator(this, iterator(nativeHandle_)); } /** *

 * Return a heap-allocated iterator over the contents of the database. The result
 * of newIterator() is initially invalid (the caller must call one of the Seek
 * methods on the iterator before using it).
 *
 * The caller should close the iterator when it is no longer needed. The returned
 * iterator should be closed before this db is closed.

* * @param readOptions {@link ReadOptions} instance. * @return instance of iterator object. */ public RocksIterator newIterator(final ReadOptions readOptions) { return new RocksIterator(this, iterator(nativeHandle_, readOptions.nativeHandle_)); } /** *

 * Return a handle to the current DB state. Iterators created with this handle
 * will all observe a stable snapshot of the current DB state. The caller must
 * call ReleaseSnapshot(result) when the snapshot is no longer needed.
 *
 * nullptr will be returned if the DB fails to take a snapshot or does not
 * support snapshots.
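 *
 * Illustrative usage sketch (assumes {@code db} is an open {@code RocksDB} instance); reads
 * issued through the {@link ReadOptions} observe the snapshot, which must be released afterwards:
 * <pre>{@code
 *   Snapshot snapshot = db.getSnapshot();
 *   try (ReadOptions readOptions = new ReadOptions().setSnapshot(snapshot)) {
 *     byte[] value = db.get(readOptions, "some-key".getBytes());
 *   } finally {
 *     db.releaseSnapshot(snapshot); // the snapshot must not be used after this call
 *   }
 * }</pre>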

* * @return Snapshot {@link Snapshot} instance */ public Snapshot getSnapshot() { long snapshotHandle = getSnapshot(nativeHandle_); if (snapshotHandle != 0) { return new Snapshot(snapshotHandle); } return null; } /** * Release a previously acquired snapshot. The caller must not * use "snapshot" after this call. * * @param snapshot {@link Snapshot} instance */ public void releaseSnapshot(final Snapshot snapshot) { if (snapshot != null) { releaseSnapshot(nativeHandle_, snapshot.nativeHandle_); } } /** *

 * Return a heap-allocated iterator over the contents of the database. The result
 * of newIterator() is initially invalid (the caller must call one of the Seek
 * methods on the iterator before using it).
 *
 * The caller should close the iterator when it is no longer needed. The returned
 * iterator should be closed before this db is closed.

* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @return instance of iterator object. */ public RocksIterator newIterator( final ColumnFamilyHandle columnFamilyHandle) { return new RocksIterator(this, iteratorCF(nativeHandle_, columnFamilyHandle.nativeHandle_)); } /** *

 * Return a heap-allocated iterator over the contents of the database. The result
 * of newIterator() is initially invalid (the caller must call one of the Seek
 * methods on the iterator before using it).
 *
 * The caller should close the iterator when it is no longer needed. The returned
 * iterator should be closed before this db is closed.

* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param readOptions {@link ReadOptions} instance. * @return instance of iterator object. */ public RocksIterator newIterator(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions readOptions) { return new RocksIterator(this, iteratorCF(nativeHandle_, columnFamilyHandle.nativeHandle_, readOptions.nativeHandle_)); } /** * Returns iterators from a consistent database state across multiple * column families. Iterators are heap allocated and need to be deleted * before the db is deleted * * @param columnFamilyHandleList {@link java.util.List} containing * {@link org.rocksdb.ColumnFamilyHandle} instances. * @return {@link java.util.List} containing {@link org.rocksdb.RocksIterator} * instances * * @throws RocksDBException thrown if error happens in underlying * native library. */ public List newIterators( final List columnFamilyHandleList) throws RocksDBException { return newIterators(columnFamilyHandleList, new ReadOptions()); } /** * Returns iterators from a consistent database state across multiple * column families. Iterators are heap allocated and need to be deleted * before the db is deleted * * @param columnFamilyHandleList {@link java.util.List} containing * {@link org.rocksdb.ColumnFamilyHandle} instances. * @param readOptions {@link ReadOptions} instance. * @return {@link java.util.List} containing {@link org.rocksdb.RocksIterator} * instances * * @throws RocksDBException thrown if error happens in underlying * native library. */ public List newIterators( final List columnFamilyHandleList, final ReadOptions readOptions) throws RocksDBException { final long[] columnFamilyHandles = new long[columnFamilyHandleList.size()]; for (int i = 0; i < columnFamilyHandleList.size(); i++) { columnFamilyHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; } final long[] iteratorRefs = iterators(nativeHandle_, columnFamilyHandles, readOptions.nativeHandle_); final List iterators = new ArrayList<>( columnFamilyHandleList.size()); for (int i=0; iFlush all memory table data.

 *
 * Note: it must be ensured that the FlushOptions instance is not GC'ed before
 * this method finishes. If the wait parameter is set to false, flush processing
 * is asynchronous.
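 *
 * Illustrative usage sketch (assumes {@code db} is an open {@code RocksDB} instance):
 * <pre>{@code
 *   try (FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) {
 *     db.flush(flushOptions); // blocks until the memtable flush has completed
 *   }
 * }</pre>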

* * @param flushOptions {@link org.rocksdb.FlushOptions} instance. * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void flush(final FlushOptions flushOptions) throws RocksDBException { flush(nativeHandle_, flushOptions.nativeHandle_); } /** *

 * Flush all memory table data.
 *
 * Note: it must be ensured that the FlushOptions instance is not GC'ed before
 * this method finishes. If the wait parameter is set to false, flush processing
 * is asynchronous.

* * @param flushOptions {@link org.rocksdb.FlushOptions} instance. * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance. * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void flush(final FlushOptions flushOptions, final ColumnFamilyHandle columnFamilyHandle) throws RocksDBException { flush(nativeHandle_, flushOptions.nativeHandle_, columnFamilyHandle.nativeHandle_); } /** *

 * Range compaction of database.
 *
 * Note: After the database has been compacted, all data will have been pushed
 * down to the last level containing any data.
 *
 * See also:
 *   - {@link #compactRange(boolean, int, int)}
 *   - {@link #compactRange(byte[], byte[])}
 *   - {@link #compactRange(byte[], byte[], boolean, int, int)}
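 *
 * Illustrative usage sketch (assumes {@code db} is an open {@code RocksDB} instance):
 * <pre>{@code
 *   // full manual compaction of the default column family
 *   db.compactRange();
 * }</pre>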
* * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void compactRange() throws RocksDBException { compactRange0(nativeHandle_, false, -1, 0); } /** *

 * Range compaction of database.
 *
 * Note: After the database has been compacted, all data will have been pushed
 * down to the last level containing any data.
 *
 * See also:
 *   - {@link #compactRange()}
 *   - {@link #compactRange(boolean, int, int)}
 *   - {@link #compactRange(byte[], byte[], boolean, int, int)}
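 *
 * Illustrative usage sketch (assumes {@code db} is an open {@code RocksDB} instance; the key
 * prefixes are made up for the example):
 * <pre>{@code
 *   // compact only the keys in ["user:0000", "user:9999")
 *   db.compactRange("user:0000".getBytes(), "user:9999".getBytes());
 * }</pre>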
* * @param begin start of key range (included in range) * @param end end of key range (excluded from range) * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void compactRange(final byte[] begin, final byte[] end) throws RocksDBException { compactRange0(nativeHandle_, begin, begin.length, end, end.length, false, -1, 0); } /** *

 * Range compaction of database.
 *
 * Note: After the database has been compacted, all data will have been pushed
 * down to the last level containing any data.
 *
 * Compaction outputs should be placed in options.db_paths[target_path_id].
 * Behavior is undefined if target_path_id is out of range.
 *
 * See also:
 *   - {@link #compactRange()}
 *   - {@link #compactRange(byte[], byte[])}
 *   - {@link #compactRange(byte[], byte[], boolean, int, int)}
* * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead * * @param reduce_level reduce level after compaction * @param target_level target level to compact to * @param target_path_id the target path id of output path * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ @Deprecated public void compactRange(final boolean reduce_level, final int target_level, final int target_path_id) throws RocksDBException { compactRange0(nativeHandle_, reduce_level, target_level, target_path_id); } /** *

 * Range compaction of database.
 *
 * Note: After the database has been compacted, all data will have been pushed
 * down to the last level containing any data.
 *
 * Compaction outputs should be placed in options.db_paths[target_path_id].
 * Behavior is undefined if target_path_id is out of range.
 *
 * See also:
 *   - {@link #compactRange()}
 *   - {@link #compactRange(boolean, int, int)}
 *   - {@link #compactRange(byte[], byte[])}
* * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead * * @param begin start of key range (included in range) * @param end end of key range (excluded from range) * @param reduce_level reduce level after compaction * @param target_level target level to compact to * @param target_path_id the target path id of output path * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ @Deprecated public void compactRange(final byte[] begin, final byte[] end, final boolean reduce_level, final int target_level, final int target_path_id) throws RocksDBException { compactRange0(nativeHandle_, begin, begin.length, end, end.length, reduce_level, target_level, target_path_id); } /** *

 * Range compaction of column family.
 *
 * Note: After the database has been compacted, all data will have been pushed
 * down to the last level containing any data.
 *
 * See also:
 *   - {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
 *   - {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
 *   - {@link #compactRange(ColumnFamilyHandle, byte[], byte[], boolean, int, int)}
* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance. * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void compactRange(final ColumnFamilyHandle columnFamilyHandle) throws RocksDBException { compactRange(nativeHandle_, false, -1, 0, columnFamilyHandle.nativeHandle_); } /** *

 * Range compaction of column family.
 *
 * Note: After the database has been compacted, all data will have been pushed
 * down to the last level containing any data.
 *
 * See also:
 *   - {@link #compactRange(ColumnFamilyHandle)}
 *   - {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
 *   - {@link #compactRange(ColumnFamilyHandle, byte[], byte[], boolean, int, int)}
* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance. * @param begin start of key range (included in range) * @param end end of key range (excluded from range) * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void compactRange(final ColumnFamilyHandle columnFamilyHandle, final byte[] begin, final byte[] end) throws RocksDBException { compactRange(nativeHandle_, begin, begin.length, end, end.length, false, -1, 0, columnFamilyHandle.nativeHandle_); } /** *

 * Range compaction of column family.
 *
 * Note: After the database has been compacted, all data will have been pushed
 * down to the last level containing any data.
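 *
 * Illustrative usage sketch (assumes {@code db} is an open {@code RocksDB} instance and
 * {@code cfHandle} a valid column family handle):
 * <pre>{@code
 *   try (CompactRangeOptions options = new CompactRangeOptions()) {
 *     db.compactRange(cfHandle, "a".getBytes(), "z".getBytes(), options);
 *   }
 * }</pre>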

* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance. * @param begin start of key range (included in range) * @param end end of key range (excluded from range) * @param compactRangeOptions options for the compaction * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void compactRange(final ColumnFamilyHandle columnFamilyHandle, final byte[] begin, final byte[] end, CompactRangeOptions compactRangeOptions) throws RocksDBException { compactRange(nativeHandle_, begin, begin.length, end, end.length, compactRangeOptions.nativeHandle_, columnFamilyHandle.nativeHandle_); } /** *

 * Range compaction of column family.
 *
 * Note: After the database has been compacted, all data will have been pushed
 * down to the last level containing any data.
 *
 * Compaction outputs should be placed in options.db_paths[target_path_id].
 * Behavior is undefined if target_path_id is out of range.
 *
 * See also:
 *   - {@link #compactRange(ColumnFamilyHandle)}
 *   - {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
 *   - {@link #compactRange(ColumnFamilyHandle, byte[], byte[], boolean, int, int)}
* * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance. * @param reduce_level reduce level after compaction * @param target_level target level to compact to * @param target_path_id the target path id of output path * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ @Deprecated public void compactRange(final ColumnFamilyHandle columnFamilyHandle, final boolean reduce_level, final int target_level, final int target_path_id) throws RocksDBException { compactRange(nativeHandle_, reduce_level, target_level, target_path_id, columnFamilyHandle.nativeHandle_); } /** *

 * Range compaction of column family.
 *
 * Note: After the database has been compacted, all data will have been pushed
 * down to the last level containing any data.
 *
 * Compaction outputs should be placed in options.db_paths[target_path_id].
 * Behavior is undefined if target_path_id is out of range.
 *
 * See also:
 *   - {@link #compactRange(ColumnFamilyHandle)}
 *   - {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
 *   - {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
* * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance. * @param begin start of key range (included in range) * @param end end of key range (excluded from range) * @param reduce_level reduce level after compaction * @param target_level target level to compact to * @param target_path_id the target path id of output path * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ @Deprecated public void compactRange(final ColumnFamilyHandle columnFamilyHandle, final byte[] begin, final byte[] end, final boolean reduce_level, final int target_level, final int target_path_id) throws RocksDBException { compactRange(nativeHandle_, begin, begin.length, end, end.length, reduce_level, target_level, target_path_id, columnFamilyHandle.nativeHandle_); } /** * This function will wait until all currently running background processes * finish. After it returns, no background process will be run until * {@link #continueBackgroundWork()} is called * * @throws RocksDBException If an error occurs when pausing background work */ public void pauseBackgroundWork() throws RocksDBException { pauseBackgroundWork(nativeHandle_); } /** * Resumes backround work which was suspended by * previously calling {@link #pauseBackgroundWork()} * * @throws RocksDBException If an error occurs when resuming background work */ public void continueBackgroundWork() throws RocksDBException { continueBackgroundWork(nativeHandle_); } /** *

The sequence number of the most recent transaction.

* * @return sequence number of the most * recent transaction. */ public long getLatestSequenceNumber() { return getLatestSequenceNumber(nativeHandle_); } /** *

 * Prevent file deletions. Compactions will continue to occur, but no obsolete
 * files will be deleted. Calling this multiple times has the same effect as
 * calling it once.

* * @throws RocksDBException thrown if operation was not performed * successfully. */ public void disableFileDeletions() throws RocksDBException { disableFileDeletions(nativeHandle_); } /** *

 * Allow compactions to delete obsolete files. If force == true, the call to
 * EnableFileDeletions() will guarantee that file deletions are enabled after
 * the call, even if DisableFileDeletions() was called multiple times before.
 *
 * If force == false, EnableFileDeletions will only enable file deletion after
 * it has been called at least as many times as DisableFileDeletions(), enabling
 * the two methods to be called by two threads concurrently without synchronization
 * -- i.e., file deletions will be enabled only after both threads call
 * EnableFileDeletions().
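 *
 * Illustrative usage sketch (assumes {@code db} is an open {@code RocksDB} instance), e.g.
 * for copying live SST files to a backup location:
 * <pre>{@code
 *   db.disableFileDeletions();
 *   try {
 *     // copy the current SST/WAL files elsewhere; no obsolete file can be removed meanwhile
 *   } finally {
 *     db.enableFileDeletions(true); // force = true unconditionally re-enables deletions
 *   }
 * }</pre>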

* * @param force boolean value described above. * * @throws RocksDBException thrown if operation was not performed * successfully. */ public void enableFileDeletions(final boolean force) throws RocksDBException { enableFileDeletions(nativeHandle_, force); } /** *

 * Returns an iterator that is positioned at a write-batch containing seq_number.
 * If the sequence number is non-existent, it returns an iterator at the first
 * available seq_no after the requested seq_no.
 *
 * Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to use this API,
 * otherwise the WAL files will get cleared aggressively and the iterator might keep
 * becoming invalid before an update is read.
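 *
 * Illustrative usage sketch (assumes {@code db} is an open {@code RocksDB} instance with
 * sufficiently large WAL retention; {@link TransactionLogIterator} API details as in the
 * bundled RocksDB version):
 * <pre>{@code
 *   try (TransactionLogIterator it = db.getUpdatesSince(0L)) {
 *     while (it.isValid()) {
 *       TransactionLogIterator.BatchResult batch = it.getBatch();
 *       long firstSequenceNumber = batch.sequenceNumber();
 *       it.next();
 *     }
 *   }
 * }</pre>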

* * @param sequenceNumber sequence number offset * * @return {@link org.rocksdb.TransactionLogIterator} instance. * * @throws org.rocksdb.RocksDBException if iterator cannot be retrieved * from native-side. */ public TransactionLogIterator getUpdatesSince(final long sequenceNumber) throws RocksDBException { return new TransactionLogIterator( getUpdatesSince(nativeHandle_, sequenceNumber)); } public void setOptions(final ColumnFamilyHandle columnFamilyHandle, final MutableColumnFamilyOptions mutableColumnFamilyOptions) throws RocksDBException { setOptions(nativeHandle_, columnFamilyHandle.nativeHandle_, mutableColumnFamilyOptions.getKeys(), mutableColumnFamilyOptions.getValues()); } private long[] toNativeHandleList(final List objectList) { final int len = objectList.size(); final long[] handleList = new long[len]; for (int i = 0; i < len; i++) { handleList[i] = objectList.get(i).nativeHandle_; } return handleList; } /** * ingestExternalFile will load a list of external SST files (1) into the DB * We will try to find the lowest possible level that the file can fit in, and * ingest the file into this level (2). A file that have a key range that * overlap with the memtable key range will require us to Flush the memtable * first before ingesting the file. * * (1) External SST files can be created using {@link SstFileWriter} * (2) We will try to ingest the files to the lowest possible level * even if the file compression doesn't match the level compression * * @param filePathList The list of files to ingest * @param ingestExternalFileOptions the options for the ingestion * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void ingestExternalFile(final List filePathList, final IngestExternalFileOptions ingestExternalFileOptions) throws RocksDBException { ingestExternalFile(nativeHandle_, getDefaultColumnFamily().nativeHandle_, filePathList.toArray(new String[filePathList.size()]), filePathList.size(), ingestExternalFileOptions.nativeHandle_); } /** * ingestExternalFile will load a list of external SST files (1) into the DB * We will try to find the lowest possible level that the file can fit in, and * ingest the file into this level (2). A file that have a key range that * overlap with the memtable key range will require us to Flush the memtable * first before ingesting the file. * * (1) External SST files can be created using {@link SstFileWriter} * (2) We will try to ingest the files to the lowest possible level * even if the file compression doesn't match the level compression * * @param columnFamilyHandle The column family for the ingested files * @param filePathList The list of files to ingest * @param ingestExternalFileOptions the options for the ingestion * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void ingestExternalFile(final ColumnFamilyHandle columnFamilyHandle, final List filePathList, final IngestExternalFileOptions ingestExternalFileOptions) throws RocksDBException { ingestExternalFile(nativeHandle_, columnFamilyHandle.nativeHandle_, filePathList.toArray(new String[filePathList.size()]), filePathList.size(), ingestExternalFileOptions.nativeHandle_); } /** * Static method to destroy the contents of the specified database. * Be very careful using this method. * * @param path the path to the Rocksdb database. * @param options {@link org.rocksdb.Options} instance. * * @throws RocksDBException thrown if error happens in underlying * native library. 
*/ public static void destroyDB(final String path, final Options options) throws RocksDBException { destroyDB(path, options.nativeHandle_); } /** * Private constructor. * * @param nativeHandle The native handle of the C++ RocksDB object */ protected RocksDB(final long nativeHandle) { super(nativeHandle); } // native methods protected native static long open(final long optionsHandle, final String path) throws RocksDBException; /** * @param optionsHandle Native handle pointing to an Options object * @param path The directory path for the database files * @param columnFamilyNames An array of column family names * @param columnFamilyOptions An array of native handles pointing to * ColumnFamilyOptions objects * * @return An array of native handles, [0] is the handle of the RocksDB object * [1..1+n] are handles of the ColumnFamilyReferences * * @throws RocksDBException thrown if the database could not be opened */ protected native static long[] open(final long optionsHandle, final String path, final byte[][] columnFamilyNames, final long[] columnFamilyOptions) throws RocksDBException; protected native static long openROnly(final long optionsHandle, final String path) throws RocksDBException; /** * @param optionsHandle Native handle pointing to an Options object * @param path The directory path for the database files * @param columnFamilyNames An array of column family names * @param columnFamilyOptions An array of native handles pointing to * ColumnFamilyOptions objects * * @return An array of native handles, [0] is the handle of the RocksDB object * [1..1+n] are handles of the ColumnFamilyReferences * * @throws RocksDBException thrown if the database could not be opened */ protected native static long[] openROnly(final long optionsHandle, final String path, final byte[][] columnFamilyNames, final long[] columnFamilyOptions ) throws RocksDBException; protected native static byte[][] listColumnFamilies(long optionsHandle, String path) throws RocksDBException; protected native void put(long handle, byte[] key, int keyOffset, int keyLength, byte[] value, int valueOffset, int valueLength) throws RocksDBException; protected native void put(long handle, byte[] key, int keyOffset, int keyLength, byte[] value, int valueOffset, int valueLength, long cfHandle) throws RocksDBException; protected native void put(long handle, long writeOptHandle, byte[] key, int keyOffset, int keyLength, byte[] value, int valueOffset, int valueLength) throws RocksDBException; protected native void put(long handle, long writeOptHandle, byte[] key, int keyOffset, int keyLength, byte[] value, int valueOffset, int valueLength, long cfHandle) throws RocksDBException; protected native void write0(final long handle, long writeOptHandle, long wbHandle) throws RocksDBException; protected native void write1(final long handle, long writeOptHandle, long wbwiHandle) throws RocksDBException; protected native boolean keyMayExist(final long handle, final byte[] key, final int keyOffset, final int keyLength, final StringBuilder stringBuilder); protected native boolean keyMayExist(final long handle, final byte[] key, final int keyOffset, final int keyLength, final long cfHandle, final StringBuilder stringBuilder); protected native boolean keyMayExist(final long handle, final long optionsHandle, final byte[] key, final int keyOffset, final int keyLength, final StringBuilder stringBuilder); protected native boolean keyMayExist(final long handle, final long optionsHandle, final byte[] key, final int keyOffset, final int keyLength, final long 
cfHandle, final StringBuilder stringBuilder); protected native void merge(long handle, byte[] key, int keyOffset, int keyLength, byte[] value, int valueOffset, int valueLength) throws RocksDBException; protected native void merge(long handle, byte[] key, int keyOffset, int keyLength, byte[] value, int valueOffset, int valueLength, long cfHandle) throws RocksDBException; protected native void merge(long handle, long writeOptHandle, byte[] key, int keyOffset, int keyLength, byte[] value, int valueOffset, int valueLength) throws RocksDBException; protected native void merge(long handle, long writeOptHandle, byte[] key, int keyOffset, int keyLength, byte[] value, int valueOffset, int valueLength, long cfHandle) throws RocksDBException; protected native int get(long handle, byte[] key, int keyOffset, int keyLength, byte[] value, int valueOffset, int valueLength) throws RocksDBException; protected native int get(long handle, byte[] key, int keyOffset, int keyLength, byte[] value, int valueOffset, int valueLength, long cfHandle) throws RocksDBException; protected native int get(long handle, long readOptHandle, byte[] key, int keyOffset, int keyLength, byte[] value, int valueOffset, int valueLength) throws RocksDBException; protected native int get(long handle, long readOptHandle, byte[] key, int keyOffset, int keyLength, byte[] value, int valueOffset, int valueLength, long cfHandle) throws RocksDBException; protected native byte[][] multiGet(final long dbHandle, final byte[][] keys, final int[] keyOffsets, final int[] keyLengths); protected native byte[][] multiGet(final long dbHandle, final byte[][] keys, final int[] keyOffsets, final int[] keyLengths, final long[] columnFamilyHandles); protected native byte[][] multiGet(final long dbHandle, final long rOptHandle, final byte[][] keys, final int[] keyOffsets, final int[] keyLengths); protected native byte[][] multiGet(final long dbHandle, final long rOptHandle, final byte[][] keys, final int[] keyOffsets, final int[] keyLengths, final long[] columnFamilyHandles); protected native byte[] get(long handle, byte[] key, int keyOffset, int keyLength) throws RocksDBException; protected native byte[] get(long handle, byte[] key, int keyOffset, int keyLength, long cfHandle) throws RocksDBException; protected native byte[] get(long handle, long readOptHandle, byte[] key, int keyOffset, int keyLength) throws RocksDBException; protected native byte[] get(long handle, long readOptHandle, byte[] key, int keyOffset, int keyLength, long cfHandle) throws RocksDBException; protected native void delete(long handle, byte[] key, int keyOffset, int keyLength) throws RocksDBException; protected native void delete(long handle, byte[] key, int keyOffset, int keyLength, long cfHandle) throws RocksDBException; protected native void delete(long handle, long writeOptHandle, byte[] key, int keyOffset, int keyLength) throws RocksDBException; protected native void delete(long handle, long writeOptHandle, byte[] key, int keyOffset, int keyLength, long cfHandle) throws RocksDBException; protected native void singleDelete( long handle, byte[] key, int keyLen) throws RocksDBException; protected native void singleDelete( long handle, byte[] key, int keyLen, long cfHandle) throws RocksDBException; protected native void singleDelete( long handle, long writeOptHandle, byte[] key, int keyLen) throws RocksDBException; protected native void singleDelete( long handle, long writeOptHandle, byte[] key, int keyLen, long cfHandle) throws RocksDBException; protected native void deleteRange(long 
handle, byte[] beginKey, int beginKeyOffset, int beginKeyLength, byte[] endKey, int endKeyOffset, int endKeyLength) throws RocksDBException; protected native void deleteRange(long handle, byte[] beginKey, int beginKeyOffset, int beginKeyLength, byte[] endKey, int endKeyOffset, int endKeyLength, long cfHandle) throws RocksDBException; protected native void deleteRange(long handle, long writeOptHandle, byte[] beginKey, int beginKeyOffset, int beginKeyLength, byte[] endKey, int endKeyOffset, int endKeyLength) throws RocksDBException; protected native void deleteRange(long handle, long writeOptHandle, byte[] beginKey, int beginKeyOffset, int beginKeyLength, byte[] endKey, int endKeyOffset, int endKeyLength, long cfHandle) throws RocksDBException; protected native String getProperty0(long nativeHandle, String property, int propertyLength) throws RocksDBException; protected native String getProperty0(long nativeHandle, long cfHandle, String property, int propertyLength) throws RocksDBException; protected native long getLongProperty(long nativeHandle, String property, int propertyLength) throws RocksDBException; protected native long getLongProperty(long nativeHandle, long cfHandle, String property, int propertyLength) throws RocksDBException; protected native long getAggregatedLongProperty(long nativeHandle, String property, int propertyLength) throws RocksDBException; protected native long iterator(long handle); protected native long iterator(long handle, long readOptHandle); protected native long iteratorCF(long handle, long cfHandle); protected native long iteratorCF(long handle, long cfHandle, long readOptHandle); protected native long[] iterators(final long handle, final long[] columnFamilyHandles, final long readOptHandle) throws RocksDBException; protected native long getSnapshot(long nativeHandle); protected native void releaseSnapshot( long nativeHandle, long snapshotHandle); @Override protected native void disposeInternal(final long handle); private native long getDefaultColumnFamily(long handle); private native long createColumnFamily(final long handle, final byte[] columnFamilyName, final long columnFamilyOptions) throws RocksDBException; private native void dropColumnFamily(long handle, long cfHandle) throws RocksDBException; private native void flush(long handle, long flushOptHandle) throws RocksDBException; private native void flush(long handle, long flushOptHandle, long cfHandle) throws RocksDBException; private native void compactRange0(long handle, boolean reduce_level, int target_level, int target_path_id) throws RocksDBException; private native void compactRange0(long handle, byte[] begin, int beginLen, byte[] end, int endLen, boolean reduce_level, int target_level, int target_path_id) throws RocksDBException; private native void compactRange(long handle, byte[] begin, int beginLen, byte[] end, int endLen, long compactRangeOptHandle, long cfHandle) throws RocksDBException; private native void compactRange(long handle, boolean reduce_level, int target_level, int target_path_id, long cfHandle) throws RocksDBException; private native void compactRange(long handle, byte[] begin, int beginLen, byte[] end, int endLen, boolean reduce_level, int target_level, int target_path_id, long cfHandle) throws RocksDBException; private native void pauseBackgroundWork(long handle) throws RocksDBException; private native void continueBackgroundWork(long handle) throws RocksDBException; private native long getLatestSequenceNumber(long handle); private native void disableFileDeletions(long handle) 
throws RocksDBException; private native void enableFileDeletions(long handle, boolean force) throws RocksDBException; private native long getUpdatesSince(long handle, long sequenceNumber) throws RocksDBException; private native void setOptions(long handle, long cfHandle, String[] keys, String[] values) throws RocksDBException; private native void ingestExternalFile(long handle, long cfHandle, String[] filePathList, int filePathListLen, long ingest_external_file_options_handle) throws RocksDBException; private native static void destroyDB(final String path, final long optionsHandle) throws RocksDBException; protected DBOptionsInterface options_; } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.13/org/apache/flink/client/deployment/application/ApplicationDispatcherBootstrap.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.client.deployment.application; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.JobID; import org.apache.flink.api.common.time.Time; import org.apache.flink.client.ClientUtils; import org.apache.flink.client.cli.ClientOptions; import org.apache.flink.client.deployment.application.executors.EmbeddedExecutor; import org.apache.flink.client.deployment.application.executors.EmbeddedExecutorServiceLoader; import org.apache.flink.client.program.PackagedProgram; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.HighAvailabilityOptions; import org.apache.flink.configuration.PipelineOptionsInternal; import org.apache.flink.core.execution.PipelineExecutorServiceLoader; import org.apache.flink.runtime.clusterframework.ApplicationStatus; import org.apache.flink.runtime.concurrent.FutureUtils; import org.apache.flink.runtime.concurrent.ScheduledExecutor; import org.apache.flink.runtime.dispatcher.DispatcherBootstrap; import org.apache.flink.runtime.dispatcher.DispatcherGateway; import org.apache.flink.runtime.jobgraph.JobGraph; import org.apache.flink.runtime.jobmanager.HighAvailabilityMode; import org.apache.flink.runtime.jobmaster.JobResult; import org.apache.flink.runtime.messages.Acknowledge; import org.apache.flink.runtime.rpc.FatalErrorHandler; import org.apache.flink.util.ExceptionUtils; import org.apache.flink.util.FlinkException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletionException; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; import java.util.function.Function; 
import java.util.stream.Collectors; import static org.apache.flink.util.Preconditions.checkNotNull; /** * A {@link DispatcherBootstrap} used for running the user's {@code main()} in "Application Mode" * (see FLIP-85). * *

 * This dispatcher bootstrap submits the recovered {@link JobGraph job graphs} for
 * re-execution (in case of recovery from a failure), and then submits the remaining
 * jobs of the application for execution.
 *
To achieve this, it works in conjunction with the {@link EmbeddedExecutor EmbeddedExecutor} * which decides if it should submit a job for execution (in case of a new job) or the job was * already recovered and is running. */ @Internal public class ApplicationDispatcherBootstrap implements DispatcherBootstrap { private static final Logger LOG = LoggerFactory.getLogger(ApplicationDispatcherBootstrap.class); public static final JobID ZERO_JOB_ID = new JobID(0, 0); private final PackagedProgram application; private final Collection recoveredJobIds; private final Configuration configuration; private final FatalErrorHandler errorHandler; private final CompletableFuture applicationCompletionFuture; private final CompletableFuture clusterShutdownFuture; private ScheduledFuture applicationExecutionTask; public ApplicationDispatcherBootstrap( final PackagedProgram application, final Collection recoveredJobIds, final Configuration configuration, final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor, final FatalErrorHandler errorHandler) { this.configuration = checkNotNull(configuration); this.recoveredJobIds = checkNotNull(recoveredJobIds); this.application = checkNotNull(application); this.errorHandler = checkNotNull(errorHandler); this.applicationCompletionFuture = fixJobIdAndRunApplicationAsync(dispatcherGateway, scheduledExecutor); this.clusterShutdownFuture = runApplicationAndShutdownClusterAsync(dispatcherGateway); } @Override public void stop() { if (applicationExecutionTask != null) { applicationExecutionTask.cancel(true); } if (applicationCompletionFuture != null) { applicationCompletionFuture.cancel(true); } } @VisibleForTesting ScheduledFuture getApplicationExecutionFuture() { return applicationExecutionTask; } @VisibleForTesting CompletableFuture getApplicationCompletionFuture() { return applicationCompletionFuture; } @VisibleForTesting CompletableFuture getClusterShutdownFuture() { return clusterShutdownFuture; } /** * Runs the user program entrypoint and shuts down the given dispatcherGateway when the * application completes (either successfully or in case of failure). 
*/ private CompletableFuture runApplicationAndShutdownClusterAsync( final DispatcherGateway dispatcherGateway) { return applicationCompletionFuture .handle( (r, t) -> { if (t == null) { LOG.info("Application completed SUCCESSFULLY"); return dispatcherGateway.shutDownCluster( ApplicationStatus.SUCCEEDED); } final Optional exception = ExceptionUtils.findThrowable( t, UnsuccessfulExecutionException.class); if (exception.isPresent()) { final ApplicationStatus applicationStatus = exception.get().getStatus(); if (applicationStatus == ApplicationStatus.CANCELED || applicationStatus == ApplicationStatus.FAILED) { LOG.info("Application {}: ", applicationStatus, t); return dispatcherGateway.shutDownCluster(applicationStatus); } } LOG.warn("Application failed unexpectedly: ", t); this.errorHandler.onFatalError( new FlinkException("Application failed unexpectedly.", t)); return FutureUtils.completedExceptionally(t); }) .thenCompose(Function.identity()); } private CompletableFuture fixJobIdAndRunApplicationAsync( final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor) { final Optional configuredJobId = configuration.getOptional(PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID); if (!HighAvailabilityMode.isHighAvailabilityModeActivated(configuration) && !configuredJobId.isPresent()) { return runApplicationAsync(dispatcherGateway, scheduledExecutor, false); } // TODO: ------------ start:二次开发代码 --------------- // if (!configuredJobId.isPresent()) { String haClusterId = configuration.getString(HighAvailabilityOptions.HA_CLUSTER_ID); String[] splits = (haClusterId != null ? haClusterId : "").split("_"); if (splits != null && splits.length == 3) { configuration.set( PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID, new JobID(Long.valueOf(splits[1]), Long.valueOf(splits[2])).toHexString()); } else { configuration.set( PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID, ZERO_JOB_ID.toHexString()); } } // TODO: ------------ end:二次开发代码 --------------- // return runApplicationAsync(dispatcherGateway, scheduledExecutor, true); } /** * Runs the user program entrypoint by scheduling a task on the given {@code scheduledExecutor}. * The returned {@link CompletableFuture} completes when all jobs of the user application * succeeded. if any of them fails, or if job submission fails. */ private CompletableFuture runApplicationAsync( final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor, final boolean enforceSingleJobExecution) { final CompletableFuture> applicationExecutionFuture = new CompletableFuture<>(); // we need to hand in a future as return value because we need to get those JobIs out // from the scheduled task that executes the user program applicationExecutionTask = scheduledExecutor.schedule( () -> runApplicationEntryPoint( applicationExecutionFuture, dispatcherGateway, scheduledExecutor, enforceSingleJobExecution), 0L, TimeUnit.MILLISECONDS); return applicationExecutionFuture.thenCompose( jobIds -> getApplicationResult(dispatcherGateway, jobIds, scheduledExecutor)); } /** * Runs the user program entrypoint and completes the given {@code jobIdsFuture} with the {@link * JobID JobIDs} of the submitted jobs. * *

This should be executed in a separate thread (or task). */ private void runApplicationEntryPoint( final CompletableFuture> jobIdsFuture, final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor, final boolean enforceSingleJobExecution) { try { final List applicationJobIds = new ArrayList<>(recoveredJobIds); final PipelineExecutorServiceLoader executorServiceLoader = new EmbeddedExecutorServiceLoader( applicationJobIds, dispatcherGateway, scheduledExecutor); ClientUtils.executeProgram( executorServiceLoader, configuration, application, enforceSingleJobExecution, true /* suppress sysout */); if (applicationJobIds.isEmpty()) { jobIdsFuture.completeExceptionally( new ApplicationExecutionException( "The application contains no execute() calls.")); } else { jobIdsFuture.complete(applicationJobIds); } } catch (Throwable t) { // TODO: ------------ start:二次开发代码 --------------- // ExceptionUtils.stringifyException(t); // TODO: ------------ end:二次开发代码 --------------- // jobIdsFuture.completeExceptionally( new ApplicationExecutionException("Could not execute application.", t)); } } private CompletableFuture getApplicationResult( final DispatcherGateway dispatcherGateway, final Collection applicationJobIds, final ScheduledExecutor executor) { final List> jobResultFutures = applicationJobIds.stream() .map( jobId -> unwrapJobResultException( getJobResult(dispatcherGateway, jobId, executor))) .collect(Collectors.toList()); return FutureUtils.waitForAll(jobResultFutures); } private CompletableFuture getJobResult( final DispatcherGateway dispatcherGateway, final JobID jobId, final ScheduledExecutor scheduledExecutor) { final Time timeout = Time.milliseconds(configuration.get(ClientOptions.CLIENT_TIMEOUT).toMillis()); final Time retryPeriod = Time.milliseconds(configuration.get(ClientOptions.CLIENT_RETRY_PERIOD).toMillis()); return JobStatusPollingUtils.getJobResult( dispatcherGateway, jobId, scheduledExecutor, timeout, retryPeriod); } /** * If the given {@link JobResult} indicates success, this passes through the {@link JobResult}. * Otherwise, this returns a future that is finished exceptionally (potentially with an * exception from the {@link JobResult}. */ private CompletableFuture unwrapJobResultException( final CompletableFuture jobResult) { return jobResult.thenApply( result -> { if (result.isSuccess()) { return result; } throw new CompletionException( UnsuccessfulExecutionException.fromJobResult( result, application.getUserCodeClassLoader())); }); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.13/org/apache/flink/configuration/GlobalConfiguration.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.configuration; import com.zto.fire.common.conf.FireFrameworkConf; import com.zto.fire.common.util.OSUtils; import com.zto.fire.common.util.PropUtils; import org.apache.flink.annotation.Internal; import org.apache.flink.util.Preconditions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.collection.JavaConversions; import javax.annotation.Nullable; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.lang.reflect.Method; import java.net.ServerSocket; import java.util.HashMap; import java.util.Map; import java.util.concurrent.atomic.AtomicBoolean; /** * Global configuration object for Flink. Similar to Java properties configuration objects it * includes key-value pairs which represent the framework's configuration. */ @Internal public final class GlobalConfiguration { private static final Logger LOG = LoggerFactory.getLogger(GlobalConfiguration.class); private static AtomicBoolean isStart = new AtomicBoolean(false); public static final String FLINK_CONF_FILENAME = "flink-conf.yaml"; // the keys whose values should be hidden private static final String[] SENSITIVE_KEYS = new String[] {"password", "secret", "fs.azure.account.key", "apikey"}; // the hidden content to be displayed public static final String HIDDEN_CONTENT = "******"; // TODO: ------------ start:二次开发代码 --------------- // // 用于判断是JobManager还是TaskManager private static boolean isJobManager = false; // fire rest服务占用端口 private static ServerSocket restServerSocket; // 任务的运行模式 private static String runMode; private static final Map settings = new HashMap<>(); static { try { restServerSocket = new ServerSocket(0); } catch (Exception e) { LOG.error("创建Socket失败", e); } } /** * 获取配置信息 */ public static Map getSettings() { return settings; } /** * 获取随机分配的Rest端口号 */ public static int getRestPort() { return restServerSocket.getLocalPort(); } /** * 获取rest服务端口号,并关闭Socket */ public static int getRestPortAndClose() { int port = restServerSocket.getLocalPort(); if (restServerSocket != null && !restServerSocket.isClosed()) { try { restServerSocket.close(); } catch (Exception e) { LOG.error("关闭Rest Socket失败", e); } } return port; } // TODO: ------------ end:二次开发代码 ----------------- // // -------------------------------------------------------------------------------------------- private GlobalConfiguration() {} // -------------------------------------------------------------------------------------------- /** * Loads the global configuration from the environment. Fails if an error occurs during loading. * Returns an empty configuration object if the environment variable is not set. In production * this variable is set but tests and local execution/debugging don't have this environment * variable set. That's why we should fail if it is not set. * * @return Returns the Configuration */ public static Configuration loadConfiguration() { return loadConfiguration(new Configuration()); } /** * Loads the global configuration and adds the given dynamic properties configuration. 
* * @param dynamicProperties The given dynamic properties * @return Returns the loaded global configuration with dynamic properties */ public static Configuration loadConfiguration(Configuration dynamicProperties) { final String configDir = System.getenv(ConfigConstants.ENV_FLINK_CONF_DIR); if (configDir == null) { return new Configuration(dynamicProperties); } return loadConfiguration(configDir, dynamicProperties); } /** * Loads the configuration files from the specified directory. * *

YAML files are supported as configuration files. * * @param configDir the directory which contains the configuration files */ public static Configuration loadConfiguration(final String configDir) { return loadConfiguration(configDir, null); } /** * Loads the configuration files from the specified directory. If the dynamic properties * configuration is not null, then it is added to the loaded configuration. * * @param configDir directory to load the configuration from * @param dynamicProperties configuration file containing the dynamic properties. Null if none. * @return The configuration loaded from the given configuration directory */ public static Configuration loadConfiguration( final String configDir, @Nullable final Configuration dynamicProperties) { if (configDir == null) { throw new IllegalArgumentException( "Given configuration directory is null, cannot load configuration"); } final File confDirFile = new File(configDir); if (!(confDirFile.exists())) { throw new IllegalConfigurationException( "The given configuration directory name '" + configDir + "' (" + confDirFile.getAbsolutePath() + ") does not describe an existing directory."); } // get Flink yaml configuration file final File yamlConfigFile = new File(confDirFile, FLINK_CONF_FILENAME); if (!yamlConfigFile.exists()) { throw new IllegalConfigurationException( "The Flink config file '" + yamlConfigFile + "' (" + yamlConfigFile.getAbsolutePath() + ") does not exist."); } Configuration configuration = loadYAMLResource(yamlConfigFile); if (dynamicProperties != null) { configuration.addAll(dynamicProperties); } return configuration; } /** * Loads a YAML-file of key-value pairs. * *

Colon and whitespace ": " separate key and value (one per line). The hash tag "#" starts a * single-line comment. * *

Example: * *

     * jobmanager.rpc.address: localhost # network address for communication with the job manager
     * jobmanager.rpc.port   : 6123      # network port to connect to for communication with the job manager
     * taskmanager.rpc.port  : 6122      # network port the task manager expects incoming IPC connections
     * 
* *

This does not span the whole YAML specification, but only the *syntax* of simple YAML * key-value pairs (see issue #113 on GitHub). If at any point in time, there is a need to go * beyond simple key-value pairs syntax compatibility will allow to introduce a YAML parser * library. * * @param file the YAML file to read from * @see YAML 1.2 specification */ private static Configuration loadYAMLResource(File file) { final Configuration config = new Configuration(); Method setSetting = null; try { Class env = Class.forName("org.apache.flink.runtime.util.EnvironmentInformation"); setSetting = env.getMethod("setSetting", String.class, String.class); } catch (Exception e) { LOG.error("获取EnvironmentInformation.setSetting()失败", e); } try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)))) { String line; int lineNo = 0; while ((line = reader.readLine()) != null) { lineNo++; // 1. check for comments String[] comments = line.split("#", 2); String conf = comments[0].trim(); // 2. get key and value if (conf.length() > 0) { String[] kv = conf.split(": ", 2); // skip line with no valid key-value pair if (kv.length == 1) { LOG.warn( "Error while trying to split key and value in configuration file " + file + ":" + lineNo + ": \"" + line + "\""); continue; } String key = kv[0].trim(); String value = kv[1].trim(); // sanity check if (key.length() == 0 || value.length() == 0) { LOG.warn( "Error after splitting key and value in configuration file " + file + ":" + lineNo + ": \"" + line + "\""); continue; } LOG.info( "Loading configuration property: {}, {}", key, isSensitive(key) ? HIDDEN_CONTENT : value); config.setString(key, value); // TODO: ------------ start:二次开发代码 --------------- // setSetting.invoke(null, key, value); // TODO: ------------ end:二次开发代码 --------------- // } } } catch (Exception e) { throw new RuntimeException("Error parsing YAML configuration.", e); } // TODO: ------------ start:二次开发代码 --------------- // fireBootstrap(config); // TODO: ------------ end:二次开发代码 --------------- // return config; } // TODO: ------------ start:二次开发代码 --------------- // /** * fire框架相关初始化动作 */ private static void fireBootstrap(Configuration config) { if (isStart.compareAndSet(false, true)) { // 加载必要的配置文件 loadTaskConfiguration(config); } } /** * 获取当前任务运行模式 */ public static String getRunMode() { return runMode; } /** * 加载必要的配置文件 */ private static void loadTaskConfiguration(Configuration config) { // 用于加载任务同名配置文件中的flink参数 // 获取当前任务的类名称 String className = config.getString("$internal.application.main", config.getString("flink.fire.className", "")); // 获取当前任务的运行模式:yarn-application或yarn-per-job runMode = config.getString("flink.execution.target", config.getString("execution.target", "")); try { Class env = Class.forName("org.apache.flink.runtime.util.EnvironmentInformation"); Method method = env.getMethod("isJobManager"); isJobManager = Boolean.valueOf(method.invoke(null) + ""); } catch (Exception e) { LOG.error("调用EnvironmentInformation.isJobManager()失败", e); } // 配置信息仅在JobManager端进行加载,TaskManager端会被主动的merge if (isJobManager && className != null && className.contains(".")) { String simpleClassName = className.substring(className.lastIndexOf('.') + 1); if (simpleClassName.length() > 0) { PropUtils.setProperty("driver.class.name", className); // TODO: 判断批处理模式,并加载对应配置文件 // PropUtils.load(FireFrameworkConf.FLINK_BATCH_CONF_FILE) PropUtils.loadFile(FireFrameworkConf.FLINK_STREAMING_CONF_FILE()); // 将所有configuration信息同步到PropUtils中 PropUtils.setProperties(config.confData); // 加载用户公共配置文件 
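                // Sketch of the load order applied below (the actual file resolution happens inside
                // PropUtils): the user common configuration is layered first, then
                // PropUtils.loadJobConf(className) loads the job-level config file named after the main
                // class together with any configuration-center overrides, so those overrides win on
                // duplicate keys; finally every merged property is copied back into the Flink
                // Configuration and into the static settings map.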
PropUtils.load(FireFrameworkConf.userCommonConf()); // 加载任务同名的配置文件 // PropUtils.loadJobConf(className); // 构建fire rest接口地址 PropUtils.setProperty(FireFrameworkConf.FIRE_REST_URL(), "http://" + OSUtils.getIp() + ":" + getRestPort()); // 加载外部系统配置信息,覆盖同名配置文件中的配置,实现动态替换 PropUtils.loadJobConf(className); PropUtils.setProperty("flink.run.mode", runMode); Map settingMap = (Map) JavaConversions.mapAsJavaMap(PropUtils.settings()); settingMap.forEach((k, v) -> { config.setString(k, v); settings.put(k, v); }); } } } /** * Check whether the key is a hidden key. * * @param key the config key */ public static boolean isSensitive(String key) { Preconditions.checkNotNull(key, "key is null"); final String keyInLower = key.toLowerCase(); // 用于隐藏webui中敏感信息 String hideKeys = ((Map) JavaConversions.mapAsJavaMap(PropUtils.settings())).getOrDefault("fire.conf.print.blacklist", "password,secret,fs.azure.account.key"); if (hideKeys != null && hideKeys.length() > 0) { String[] hideKeyArr = hideKeys.split(","); for (String hideKey : hideKeyArr) { if (keyInLower.length() >= hideKey.length() && keyInLower.contains(hideKey)) { return true; } } } return false; } // TODO: ------------ end:二次开发代码 ----------------- // } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.13/org/apache/flink/contrib/streaming/state/EmbeddedRocksDBStateBackend.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.contrib.streaming.state; import com.zto.fire.common.util.PropUtils; import org.apache.commons.lang3.StringUtils; import org.apache.curator.framework.CuratorFramework; import org.apache.curator.framework.CuratorFrameworkFactory; import org.apache.curator.framework.recipes.atomic.AtomicValue; import org.apache.curator.framework.recipes.atomic.DistributedAtomicInteger; import org.apache.curator.retry.ExponentialBackoffRetry; import org.apache.curator.retry.RetryOneTime; import org.apache.flink.annotation.PublicEvolving; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.ExecutionConfig; import org.apache.flink.api.common.JobID; import org.apache.flink.api.common.typeutils.TypeSerializer; import org.apache.flink.configuration.CheckpointingOptions; import org.apache.flink.configuration.IllegalConfigurationException; import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.configuration.TaskManagerOptions; import org.apache.flink.core.fs.CloseableRegistry; import org.apache.flink.core.fs.Path; import org.apache.flink.metrics.MetricGroup; import org.apache.flink.runtime.execution.Environment; import org.apache.flink.runtime.memory.OpaqueMemoryResource; import org.apache.flink.runtime.query.TaskKvStateRegistry; import org.apache.flink.runtime.state.AbstractKeyedStateBackend; import org.apache.flink.runtime.state.AbstractManagedMemoryStateBackend; import org.apache.flink.runtime.state.ConfigurableStateBackend; import org.apache.flink.runtime.state.DefaultOperatorStateBackendBuilder; import org.apache.flink.runtime.state.KeyGroupRange; import org.apache.flink.runtime.state.KeyedStateHandle; import org.apache.flink.runtime.state.LocalRecoveryConfig; import org.apache.flink.runtime.state.OperatorStateBackend; import org.apache.flink.runtime.state.OperatorStateHandle; import org.apache.flink.runtime.state.StreamCompressionDecorator; import org.apache.flink.runtime.state.metrics.LatencyTrackingStateConfig; import org.apache.flink.runtime.state.ttl.TtlTimeProvider; import org.apache.flink.util.AbstractID; import org.apache.flink.util.DynamicCodeLoadingException; import org.apache.flink.util.FileUtils; import org.apache.flink.util.FlinkRuntimeException; import org.apache.flink.util.Preconditions; import org.apache.flink.util.TernaryBoolean; import org.rocksdb.NativeLibraryLoader; import org.rocksdb.RocksDB; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nonnull; import javax.annotation.Nullable; import java.io.File; import java.io.IOException; import java.lang.reflect.Field; import java.net.URI; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Random; import java.util.UUID; import static org.apache.flink.contrib.streaming.state.RocksDBConfigurableOptions.WRITE_BATCH_SIZE; import static org.apache.flink.contrib.streaming.state.RocksDBOptions.CHECKPOINT_TRANSFER_THREAD_NUM; import static org.apache.flink.contrib.streaming.state.RocksDBOptions.TIMER_SERVICE_FACTORY; import static org.apache.flink.util.Preconditions.checkArgument; import static org.apache.flink.util.Preconditions.checkNotNull; /** * A {@link org.apache.flink.runtime.state.StateBackend} that stores its state in an embedded {@code * RocksDB} instance. This state backend can store very large state that exceeds memory and spills * to local disk. All key/value state (including windows) is stored in the key/value index of * RocksDB. 
For persistence against loss of machines, please configure a {@link * org.apache.flink.runtime.state.CheckpointStorage} instance for the Job. * *

The behavior of the RocksDB instances can be parametrized by setting RocksDB Options using the * methods {@link #setPredefinedOptions(PredefinedOptions)} and {@link * #setRocksDBOptions(RocksDBOptionsFactory)}. */ @PublicEvolving public class EmbeddedRocksDBStateBackend extends AbstractManagedMemoryStateBackend implements ConfigurableStateBackend { /** The options to chose for the type of priority queue state. */ public enum PriorityQueueStateType { HEAP, ROCKSDB } private static final long serialVersionUID = 1L; private static final Logger LOG = LoggerFactory.getLogger(EmbeddedRocksDBStateBackend.class); /** The number of (re)tries for loading the RocksDB JNI library. */ private static final int ROCKSDB_LIB_LOADING_ATTEMPTS = 3; /** Flag whether the native library has been loaded. */ private static boolean rocksDbInitialized = false; private static final int UNDEFINED_NUMBER_OF_TRANSFER_THREADS = -1; private static final long UNDEFINED_WRITE_BATCH_SIZE = -1; // ------------------------------------------------------------------------ // -- configuration values, set in the application / configuration /** * Base paths for RocksDB directory, as configured. Null if not yet set, in which case the * configuration values will be used. The configuration defaults to the TaskManager's temp * directories. */ @Nullable private File[] localRocksDbDirectories; /** The pre-configured option settings. */ @Nullable private PredefinedOptions predefinedOptions; /** The options factory to create the RocksDB options in the cluster. */ @Nullable private RocksDBOptionsFactory rocksDbOptionsFactory; /** This determines if incremental checkpointing is enabled. */ private final TernaryBoolean enableIncrementalCheckpointing; /** Thread number used to transfer (download and upload) state, default value: 1. */ private int numberOfTransferThreads; /** The configuration for memory settings (pool sizes, etc.). */ private final RocksDBMemoryConfiguration memoryConfiguration; /** This determines the type of priority queue state. */ @Nullable private EmbeddedRocksDBStateBackend.PriorityQueueStateType priorityQueueStateType; /** The default rocksdb metrics options. */ private final RocksDBNativeMetricOptions defaultMetricOptions; // -- runtime values, set on TaskManager when initializing / using the backend /** Base paths for RocksDB directory, as initialized. */ private transient File[] initializedDbBasePaths; /** JobID for uniquifying backup paths. */ private transient JobID jobId; /** The index of the next directory to be used from {@link #initializedDbBasePaths}. */ private transient int nextDirectory; /** Whether we already lazily initialized our local storage directories. */ private transient boolean isInitialized; /** * Max consumed memory size for one batch in {@link RocksDBWriteBatchWrapper}, default value * 2mb. 
*/ private long writeBatchSize; // ------------------------------------------------------------------------ // TODO: ------------ start:二次开发代码 --------------- // /** * State disk choose policy */ private static final String FLINK_STATE_DISK_CHOOSE_POLICY_ROUND_ROBIN = "ROUND_ROBIN"; /** * Default state disk choose policy */ private static final String FLINK_STATE_DISK_CHOOSE_POLICY_DEFAULT = "DEFAULT"; /** * distributed dir on each taskManager */ private DistributedAtomicInteger dirIndex; /** * state choose disk policy */ private String stateDiskPolicy; private transient CuratorFramework client; private String currentHostName; // 初始化标识,避免多次初始化 private boolean isInitZKClient = false; // 用于统计磁盘负载的zk地址 private final static String STATE_ZOOKEEPER_URL = "flink.state.external.zookeeper.url"; // 状态本地磁盘路径选取策略:default/round_robin private final static String STATE_CHOOSE_DISK_POLICY = "flink.state.choose.disk.policy"; /** * 初始化round_robin策略下的zookeeper连接 */ private void initZKClient() { synchronized (EmbeddedRocksDBStateBackend.class) { if (isInitZKClient) return; this.isInitZKClient = true; final String zkUrl = PropUtils.getString(STATE_ZOOKEEPER_URL, ""); this.stateDiskPolicy = PropUtils.getString(STATE_CHOOSE_DISK_POLICY, FLINK_STATE_DISK_CHOOSE_POLICY_DEFAULT).toUpperCase(); LOG.info("当前磁盘路径选择策略:" + this.stateDiskPolicy); // 如果zk地址不为空,并且开启了ROUND_ROBIN磁盘路径选择策略,则建立zookeeper的连接,避免太多任务建立太多的连接 if (StringUtils.isNotBlank(zkUrl) && this.isRoundRobin()) { try { LOG.info("开启基于zookeeper的本地磁盘状态路径选择策略"); this.client = CuratorFrameworkFactory.builder().connectString(zkUrl) .connectionTimeoutMs(5000).retryPolicy(new RetryOneTime(5000)).build(); this.client.start(); Runtime.getRuntime().addShutdownHook(new Thread(() -> { if (client != null) { client.close(); LOG.info("释放基于zookeeper的本地磁盘状态路径选择策略的连接"); } })); } catch (Exception e) { LOG.error("初始化CuratorFrameworkFactory失败", e); } } } } /** * 判断是否为ROUND_ROBIN模式 */ private boolean isRoundRobin() { if (!this.isInitZKClient) this.initZKClient(); return FLINK_STATE_DISK_CHOOSE_POLICY_ROUND_ROBIN.equalsIgnoreCase(this.stateDiskPolicy); } // TODO: ------------ end:二次开发代码 --------------- // /** Creates a new {@code EmbeddedRocksDBStateBackend} for storing local state. */ public EmbeddedRocksDBStateBackend() { this(TernaryBoolean.UNDEFINED); } /** * Creates a new {@code EmbeddedRocksDBStateBackend} for storing local state. * * @param enableIncrementalCheckpointing True if incremental checkpointing is enabled. */ public EmbeddedRocksDBStateBackend(boolean enableIncrementalCheckpointing) { this(TernaryBoolean.fromBoolean(enableIncrementalCheckpointing)); } /** * Creates a new {@code EmbeddedRocksDBStateBackend} for storing local state. * * @param enableIncrementalCheckpointing True if incremental checkpointing is enabled. */ public EmbeddedRocksDBStateBackend(TernaryBoolean enableIncrementalCheckpointing) { this.enableIncrementalCheckpointing = enableIncrementalCheckpointing; this.numberOfTransferThreads = UNDEFINED_NUMBER_OF_TRANSFER_THREADS; this.defaultMetricOptions = new RocksDBNativeMetricOptions(); this.memoryConfiguration = new RocksDBMemoryConfiguration(); this.writeBatchSize = UNDEFINED_WRITE_BATCH_SIZE; // TODO: ------------ start:二次开发代码 --------------- // this.initZKClient(); // TODO: ------------ end:二次开发代码 --------------- // } /** * Private constructor that creates a re-configured copy of the state backend. * * @param original The state backend to re-configure. * @param config The configuration. * @param classLoader The class loader. 
*/ private EmbeddedRocksDBStateBackend( EmbeddedRocksDBStateBackend original, ReadableConfig config, ClassLoader classLoader) { // configure incremental checkpoints this.enableIncrementalCheckpointing = original.enableIncrementalCheckpointing.resolveUndefined( config.get(CheckpointingOptions.INCREMENTAL_CHECKPOINTS)); if (original.numberOfTransferThreads == UNDEFINED_NUMBER_OF_TRANSFER_THREADS) { this.numberOfTransferThreads = config.get(CHECKPOINT_TRANSFER_THREAD_NUM); } else { this.numberOfTransferThreads = original.numberOfTransferThreads; } if (original.writeBatchSize == UNDEFINED_WRITE_BATCH_SIZE) { this.writeBatchSize = config.get(WRITE_BATCH_SIZE).getBytes(); } else { this.writeBatchSize = original.writeBatchSize; } this.memoryConfiguration = RocksDBMemoryConfiguration.fromOtherAndConfiguration( original.memoryConfiguration, config); this.memoryConfiguration.validate(); if (null == original.priorityQueueStateType) { this.priorityQueueStateType = config.get(TIMER_SERVICE_FACTORY); } else { this.priorityQueueStateType = original.priorityQueueStateType; } // configure local directories if (original.localRocksDbDirectories != null) { this.localRocksDbDirectories = original.localRocksDbDirectories; } else { final String rocksdbLocalPaths = config.get(RocksDBOptions.LOCAL_DIRECTORIES); if (rocksdbLocalPaths != null) { String[] directories = rocksdbLocalPaths.split(",|" + File.pathSeparator); try { setDbStoragePaths(directories); } catch (IllegalArgumentException e) { throw new IllegalConfigurationException( "Invalid configuration for RocksDB state " + "backend's local storage directories: " + e.getMessage(), e); } } } // configure metric options this.defaultMetricOptions = RocksDBNativeMetricOptions.fromConfig(config); // configure RocksDB predefined options this.predefinedOptions = original.predefinedOptions == null ? PredefinedOptions.valueOf(config.get(RocksDBOptions.PREDEFINED_OPTIONS)) : original.predefinedOptions; LOG.info("Using predefined options: {}.", predefinedOptions.name()); // configure RocksDB options factory try { rocksDbOptionsFactory = configureOptionsFactory( original.rocksDbOptionsFactory, config.get(RocksDBOptions.OPTIONS_FACTORY), config, classLoader); } catch (DynamicCodeLoadingException e) { throw new FlinkRuntimeException(e); } // configure latency tracking latencyTrackingConfigBuilder = original.latencyTrackingConfigBuilder.configure(config); } // ------------------------------------------------------------------------ // Reconfiguration // ------------------------------------------------------------------------ /** * Creates a copy of this state backend that uses the values defined in the configuration for * fields where that were not yet specified in this state backend. * * @param config The configuration. * @param classLoader The class loader. 
* @return The re-configured variant of the state backend */ @Override public EmbeddedRocksDBStateBackend configure(ReadableConfig config, ClassLoader classLoader) { return new EmbeddedRocksDBStateBackend(this, config, classLoader); } // ------------------------------------------------------------------------ // State backend methods // ------------------------------------------------------------------------ private void lazyInitializeForJob( Environment env, @SuppressWarnings("unused") String operatorIdentifier) throws IOException { if (isInitialized) { return; } this.jobId = env.getJobID(); // initialize the paths where the local RocksDB files should be stored if (localRocksDbDirectories == null) { // initialize from the temp directories initializedDbBasePaths = env.getIOManager().getSpillingDirectories(); } else { List dirs = new ArrayList<>(localRocksDbDirectories.length); StringBuilder errorMessage = new StringBuilder(); for (File f : localRocksDbDirectories) { File testDir = new File(f, UUID.randomUUID().toString()); if (!testDir.mkdirs()) { String msg = "Local DB files directory '" + f + "' does not exist and cannot be created. "; LOG.error(msg); errorMessage.append(msg); } else { dirs.add(f); } //noinspection ResultOfMethodCallIgnored testDir.delete(); } if (dirs.isEmpty()) { throw new IOException("No local storage directories available. " + errorMessage); } else { initializedDbBasePaths = dirs.toArray(new File[0]); } } // TODO: ------------ start:二次开发代码 --------------- // if (isRoundRobin()) { this.currentHostName = env.getTaskManagerInfo().getConfiguration().getString( TaskManagerOptions.HOST); } // TODO: ------------ end:二次开发代码 --------------- // nextDirectory = new Random().nextInt(initializedDbBasePaths.length); isInitialized = true; } private File getNextStoragePath() { // TODO: ------------ start:二次开发代码 --------------- // int ni = nextDirectory; if (isRoundRobin()) { try { String counterPath = "/rocksDB/" + this.currentHostName; ExponentialBackoffRetry retryPolicy = new ExponentialBackoffRetry(1000, 10); this.dirIndex = new DistributedAtomicInteger(this.client, counterPath, retryPolicy); this.dirIndex.initialize(0); AtomicValue value = this.dirIndex.increment(); if (value.succeeded()) { ni = value.postValue() % initializedDbBasePaths.length; } else { ni = new Random().nextInt(initializedDbBasePaths.length); } } catch (Exception e) { ni = new Random().nextInt(initializedDbBasePaths.length); LOG.error("基于zookeeper的本地状态磁盘路径选择发生异常,请在commons.properties文件中指定以下参数恢复到flink默认的选择策略:flink.state.choose.disk.policy=default", e); } } else { ni = nextDirectory + 1; ni = ni >= initializedDbBasePaths.length ? 
0 : ni; nextDirectory = ni; } LOG.info("Next state file storage path is: " + initializedDbBasePaths[ni].getPath()); // TODO: ------------ end:二次开发代码 --------------- // return initializedDbBasePaths[ni]; } // ------------------------------------------------------------------------ // State holding data structures // ------------------------------------------------------------------------ @Override public AbstractKeyedStateBackend createKeyedStateBackend( Environment env, JobID jobID, String operatorIdentifier, TypeSerializer keySerializer, int numberOfKeyGroups, KeyGroupRange keyGroupRange, TaskKvStateRegistry kvStateRegistry, TtlTimeProvider ttlTimeProvider, MetricGroup metricGroup, @Nonnull Collection stateHandles, CloseableRegistry cancelStreamRegistry) throws IOException { return createKeyedStateBackend( env, jobID, operatorIdentifier, keySerializer, numberOfKeyGroups, keyGroupRange, kvStateRegistry, ttlTimeProvider, metricGroup, stateHandles, cancelStreamRegistry, 1.0); } @Override public AbstractKeyedStateBackend createKeyedStateBackend( Environment env, JobID jobID, String operatorIdentifier, TypeSerializer keySerializer, int numberOfKeyGroups, KeyGroupRange keyGroupRange, TaskKvStateRegistry kvStateRegistry, TtlTimeProvider ttlTimeProvider, MetricGroup metricGroup, @Nonnull Collection stateHandles, CloseableRegistry cancelStreamRegistry, double managedMemoryFraction) throws IOException { // first, make sure that the RocksDB JNI library is loaded // we do this explicitly here to have better error handling String tempDir = env.getTaskManagerInfo().getTmpDirectories()[0]; ensureRocksDBIsLoaded(tempDir); // replace all characters that are not legal for filenames with underscore String fileCompatibleIdentifier = operatorIdentifier.replaceAll("[^a-zA-Z0-9\\-]", "_"); lazyInitializeForJob(env, fileCompatibleIdentifier); File instanceBasePath = new File( getNextStoragePath(), "job_" + jobId + "_op_" + fileCompatibleIdentifier + "_uuid_" + UUID.randomUUID()); LocalRecoveryConfig localRecoveryConfig = env.getTaskStateManager().createLocalRecoveryConfig(); final OpaqueMemoryResource sharedResources = RocksDBOperationUtils.allocateSharedCachesIfConfigured( memoryConfiguration, env.getMemoryManager(), managedMemoryFraction, LOG); if (sharedResources != null) { LOG.info("Obtained shared RocksDB cache of size {} bytes", sharedResources.getSize()); } final RocksDBResourceContainer resourceContainer = createOptionsAndResourceContainer(sharedResources); ExecutionConfig executionConfig = env.getExecutionConfig(); StreamCompressionDecorator keyGroupCompressionDecorator = getCompressionDecorator(executionConfig); LatencyTrackingStateConfig latencyTrackingStateConfig = latencyTrackingConfigBuilder.setMetricGroup(metricGroup).build(); RocksDBKeyedStateBackendBuilder builder = new RocksDBKeyedStateBackendBuilder<>( operatorIdentifier, env.getUserCodeClassLoader().asClassLoader(), instanceBasePath, resourceContainer, stateName -> resourceContainer.getColumnOptions(), kvStateRegistry, keySerializer, numberOfKeyGroups, keyGroupRange, executionConfig, localRecoveryConfig, getPriorityQueueStateType(), ttlTimeProvider, latencyTrackingStateConfig, metricGroup, stateHandles, keyGroupCompressionDecorator, cancelStreamRegistry) .setEnableIncrementalCheckpointing(isIncrementalCheckpointsEnabled()) .setNumberOfTransferingThreads(getNumberOfTransferThreads()) .setNativeMetricOptions( resourceContainer.getMemoryWatcherOptions(defaultMetricOptions)) .setWriteBatchSize(getWriteBatchSize()); return builder.build(); } 
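    // ------------------------------------------------------------------------
    //  Round-robin local state disk selection (fire secondary development)
    // ------------------------------------------------------------------------
    // A minimal configuration sketch for the policy implemented in getNextStoragePath() above;
    // the two keys are the constants STATE_CHOOSE_DISK_POLICY and STATE_ZOOKEEPER_URL, and the
    // values shown here are illustrative only:
    //
    //   flink.state.choose.disk.policy     = round_robin
    //   flink.state.external.zookeeper.url = zk1:2181,zk2:2181,zk3:2181
    //
    // With this policy each TaskManager increments a DistributedAtomicInteger under
    // /rocksDB/<taskmanager-host> and picks
    // initializedDbBasePaths[postValue % initializedDbBasePaths.length]; if the increment fails
    // or ZooKeeper is unreachable the backend falls back to a random directory, and setting the
    // policy back to "default" (e.g. in commons.properties) restores Flink's built-in behavior.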
@Override public OperatorStateBackend createOperatorStateBackend( Environment env, String operatorIdentifier, @Nonnull Collection stateHandles, CloseableRegistry cancelStreamRegistry) throws Exception { // the default for RocksDB; eventually there can be a operator state backend based on // RocksDB, too. final boolean asyncSnapshots = true; return new DefaultOperatorStateBackendBuilder( env.getUserCodeClassLoader().asClassLoader(), env.getExecutionConfig(), asyncSnapshots, stateHandles, cancelStreamRegistry) .build(); } private RocksDBOptionsFactory configureOptionsFactory( @Nullable RocksDBOptionsFactory originalOptionsFactory, String factoryClassName, ReadableConfig config, ClassLoader classLoader) throws DynamicCodeLoadingException { if (originalOptionsFactory != null) { if (originalOptionsFactory instanceof ConfigurableRocksDBOptionsFactory) { originalOptionsFactory = ((ConfigurableRocksDBOptionsFactory) originalOptionsFactory) .configure(config); } LOG.info("Using application-defined options factory: {}.", originalOptionsFactory); return originalOptionsFactory; } // if using DefaultConfigurableOptionsFactory by default, we could avoid reflection to speed // up. if (factoryClassName.equalsIgnoreCase(DefaultConfigurableOptionsFactory.class.getName())) { DefaultConfigurableOptionsFactory optionsFactory = new DefaultConfigurableOptionsFactory(); optionsFactory.configure(config); LOG.info("Using default options factory: {}.", optionsFactory); return optionsFactory; } else { try { Class clazz = Class.forName(factoryClassName, false, classLoader) .asSubclass(RocksDBOptionsFactory.class); RocksDBOptionsFactory optionsFactory = clazz.newInstance(); if (optionsFactory instanceof ConfigurableRocksDBOptionsFactory) { optionsFactory = ((ConfigurableRocksDBOptionsFactory) optionsFactory).configure(config); } LOG.info("Using configured options factory: {}.", optionsFactory); return optionsFactory; } catch (ClassNotFoundException e) { throw new DynamicCodeLoadingException( "Cannot find configured options factory class: " + factoryClassName, e); } catch (ClassCastException | InstantiationException | IllegalAccessException e) { throw new DynamicCodeLoadingException( "The class configured under '" + RocksDBOptions.OPTIONS_FACTORY.key() + "' is not a valid options factory (" + factoryClassName + ')', e); } } } // ------------------------------------------------------------------------ // Parameters // ------------------------------------------------------------------------ /** * Gets the memory configuration object, which offers settings to control RocksDB's memory * usage. */ public RocksDBMemoryConfiguration getMemoryConfiguration() { return memoryConfiguration; } /** * Sets the path where the RocksDB local database files should be stored on the local file * system. Setting this path overrides the default behavior, where the files are stored across * the configured temp directories. * *

Passing {@code null} to this function restores the default behavior, where the configured * temp directories will be used. * * @param path The path where the local RocksDB database files are stored. */ public void setDbStoragePath(String path) { setDbStoragePaths(path == null ? null : new String[] {path}); } /** * Sets the directories in which the local RocksDB database puts its files (like SST and * metadata files). These directories do not need to be persistent, they can be ephemeral, * meaning that they are lost on a machine failure, because state in RocksDB is persisted in * checkpoints. * *

If nothing is configured, these directories default to the TaskManager's local temporary * file directories. * *

Each distinct state will be stored in one path, but when the state backend creates * multiple states, they will store their files on different paths. * *

Passing {@code null} to this function restores the default behavior, where the configured * temp directories will be used. * * @param paths The paths across which the local RocksDB database files will be spread. */ public void setDbStoragePaths(String... paths) { if (paths == null) { localRocksDbDirectories = null; } else if (paths.length == 0) { throw new IllegalArgumentException("empty paths"); } else { File[] pp = new File[paths.length]; for (int i = 0; i < paths.length; i++) { final String rawPath = paths[i]; final String path; if (rawPath == null) { throw new IllegalArgumentException("null path"); } else { // we need this for backwards compatibility, to allow URIs like 'file:///'... URI uri = null; try { uri = new Path(rawPath).toUri(); } catch (Exception e) { // cannot parse as a path } if (uri != null && uri.getScheme() != null) { if ("file".equalsIgnoreCase(uri.getScheme())) { path = uri.getPath(); } else { throw new IllegalArgumentException( "Path " + rawPath + " has a non-local scheme"); } } else { path = rawPath; } } pp[i] = new File(path); if (!pp[i].isAbsolute()) { throw new IllegalArgumentException("Relative paths are not supported"); } } localRocksDbDirectories = pp; } } /** * Gets the configured local DB storage paths, or null, if none were configured. * *

 Under these directories on the TaskManager, RocksDB stores its SST files and metadata * files. These directories do not need to be persistent, they can be ephemeral, meaning that * they are lost on a machine failure, because state in RocksDB is persisted in checkpoints. * *

If nothing is configured, these directories default to the TaskManager's local temporary * file directories. */ public String[] getDbStoragePaths() { if (localRocksDbDirectories == null) { return null; } else { String[] paths = new String[localRocksDbDirectories.length]; for (int i = 0; i < paths.length; i++) { paths[i] = localRocksDbDirectories[i].toString(); } return paths; } } /** Gets whether incremental checkpoints are enabled for this state backend. */ public boolean isIncrementalCheckpointsEnabled() { return enableIncrementalCheckpointing.getOrDefault( CheckpointingOptions.INCREMENTAL_CHECKPOINTS.defaultValue()); } /** * Gets the type of the priority queue state. It will fallback to the default value, if it is * not explicitly set. * * @return The type of the priority queue state. */ public EmbeddedRocksDBStateBackend.PriorityQueueStateType getPriorityQueueStateType() { return priorityQueueStateType == null ? TIMER_SERVICE_FACTORY.defaultValue() : priorityQueueStateType; } /** * Sets the type of the priority queue state. It will fallback to the default value, if it is * not explicitly set. */ public void setPriorityQueueStateType( EmbeddedRocksDBStateBackend.PriorityQueueStateType priorityQueueStateType) { this.priorityQueueStateType = checkNotNull(priorityQueueStateType); } // ------------------------------------------------------------------------ // Parametrize with RocksDB Options // ------------------------------------------------------------------------ /** * Sets the predefined options for RocksDB. * *

 If user-configured options within {@link RocksDBConfigurableOptions} are set (through * flink-conf.yaml) or a user-defined options factory is set (via {@link * #setRocksDBOptions(RocksDBOptionsFactory)}), then the options from the factory are applied on * top of the predefined options and customized options specified here. * * @param options The options to set (must not be null). */ public void setPredefinedOptions(@Nonnull PredefinedOptions options) { predefinedOptions = checkNotNull(options); } /** * Gets the currently set predefined options for RocksDB. The default options (if nothing was * set via {@link #setPredefinedOptions(PredefinedOptions)}) are {@link * PredefinedOptions#DEFAULT}. * *

 If user-configured options within {@link RocksDBConfigurableOptions} are set (through * flink-conf.yaml) or a user-defined options factory is set (via {@link * #setRocksDBOptions(RocksDBOptionsFactory)}), then the options from the factory are applied on * top of the predefined and customized options. * * @return The currently set predefined options for RocksDB. */ @VisibleForTesting public PredefinedOptions getPredefinedOptions() { if (predefinedOptions == null) { predefinedOptions = PredefinedOptions.DEFAULT; } return predefinedOptions; } /** * Sets {@link org.rocksdb.Options} for the RocksDB instances. Because the options are not * serializable and hold native code references, they must be specified through a factory. * *

The options created by the factory here are applied on top of the pre-defined options * profile selected via {@link #setPredefinedOptions(PredefinedOptions)}. If the pre-defined * options profile is the default ({@link PredefinedOptions#DEFAULT}), then the factory fully * controls the RocksDB options. * * @param optionsFactory The options factory that lazily creates the RocksDB options. */ public void setRocksDBOptions(RocksDBOptionsFactory optionsFactory) { this.rocksDbOptionsFactory = optionsFactory; } /** * Gets {@link org.rocksdb.Options} for the RocksDB instances. * *

The options created by the factory here are applied on top of the pre-defined options * profile selected via {@link #setPredefinedOptions(PredefinedOptions)}. If the pre-defined * options profile is the default ({@link PredefinedOptions#DEFAULT}), then the factory fully * controls the RocksDB options. */ @Nullable public RocksDBOptionsFactory getRocksDBOptions() { return rocksDbOptionsFactory; } /** Gets the number of threads used to transfer files while snapshotting/restoring. */ public int getNumberOfTransferThreads() { return numberOfTransferThreads == UNDEFINED_NUMBER_OF_TRANSFER_THREADS ? CHECKPOINT_TRANSFER_THREAD_NUM.defaultValue() : numberOfTransferThreads; } /** * Sets the number of threads used to transfer files while snapshotting/restoring. * * @param numberOfTransferThreads The number of threads used to transfer files while * snapshotting/restoring. */ public void setNumberOfTransferThreads(int numberOfTransferThreads) { Preconditions.checkArgument( numberOfTransferThreads > 0, "The number of threads used to transfer files in EmbeddedRocksDBStateBackend should be greater than zero."); this.numberOfTransferThreads = numberOfTransferThreads; } /** Gets the max batch size will be used in {@link RocksDBWriteBatchWrapper}. */ public long getWriteBatchSize() { return writeBatchSize == UNDEFINED_WRITE_BATCH_SIZE ? WRITE_BATCH_SIZE.defaultValue().getBytes() : writeBatchSize; } /** * Sets the max batch size will be used in {@link RocksDBWriteBatchWrapper}, no positive value * will disable memory size controller, just use item count controller. * * @param writeBatchSize The size will used to be used in {@link RocksDBWriteBatchWrapper}. */ public void setWriteBatchSize(long writeBatchSize) { checkArgument(writeBatchSize >= 0, "Write batch size have to be no negative."); this.writeBatchSize = writeBatchSize; } // ------------------------------------------------------------------------ // utilities // ------------------------------------------------------------------------ @VisibleForTesting RocksDBResourceContainer createOptionsAndResourceContainer() { return createOptionsAndResourceContainer(null); } @VisibleForTesting private RocksDBResourceContainer createOptionsAndResourceContainer( @Nullable OpaqueMemoryResource sharedResources) { return new RocksDBResourceContainer( predefinedOptions != null ? 
predefinedOptions : PredefinedOptions.DEFAULT, rocksDbOptionsFactory, sharedResources); } @Override public String toString() { return "EmbeddedRocksDBStateBackend{" + ", localRocksDbDirectories=" + Arrays.toString(localRocksDbDirectories) + ", enableIncrementalCheckpointing=" + enableIncrementalCheckpointing + ", numberOfTransferThreads=" + numberOfTransferThreads + ", writeBatchSize=" + writeBatchSize + '}'; } // ------------------------------------------------------------------------ // static library loading utilities // ------------------------------------------------------------------------ @VisibleForTesting static void ensureRocksDBIsLoaded(String tempDirectory) throws IOException { synchronized (EmbeddedRocksDBStateBackend.class) { if (!rocksDbInitialized) { final File tempDirParent = new File(tempDirectory).getAbsoluteFile(); LOG.info( "Attempting to load RocksDB native library and store it under '{}'", tempDirParent); Throwable lastException = null; for (int attempt = 1; attempt <= ROCKSDB_LIB_LOADING_ATTEMPTS; attempt++) { File rocksLibFolder = null; try { // when multiple instances of this class and RocksDB exist in different // class loaders, then we can see the following exception: // "java.lang.UnsatisfiedLinkError: Native Library // /path/to/temp/dir/librocksdbjni-linux64.so // already loaded in another class loader" // to avoid that, we need to add a random element to the library file path // (I know, seems like an unnecessary hack, since the JVM obviously can // handle multiple // instances of the same JNI library being loaded in different class // loaders, but // apparently not when coming from the same file path, so there we go) rocksLibFolder = new File(tempDirParent, "rocksdb-lib-" + new AbstractID()); // make sure the temp path exists LOG.debug( "Attempting to create RocksDB native library folder {}", rocksLibFolder); // noinspection ResultOfMethodCallIgnored rocksLibFolder.mkdirs(); // explicitly load the JNI dependency if it has not been loaded before NativeLibraryLoader.getInstance() .loadLibrary(rocksLibFolder.getAbsolutePath()); // this initialization here should validate that the loading succeeded RocksDB.loadLibrary(); // seems to have worked LOG.info("Successfully loaded RocksDB native library"); rocksDbInitialized = true; return; } catch (Throwable t) { lastException = t; LOG.debug("RocksDB JNI library loading attempt {} failed", attempt, t); // try to force RocksDB to attempt reloading the library try { resetRocksDBLoadedFlag(); } catch (Throwable tt) { LOG.debug( "Failed to reset 'initialized' flag in RocksDB native code loader", tt); } FileUtils.deleteDirectoryQuietly(rocksLibFolder); } } throw new IOException("Could not load the native RocksDB library", lastException); } } } @VisibleForTesting static void resetRocksDBLoadedFlag() throws Exception { final Field initField = org.rocksdb.NativeLibraryLoader.class.getDeclaredField("initialized"); initField.setAccessible(true); initField.setBoolean(null, false); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.13/org/apache/flink/runtime/checkpoint/CheckpointCoordinator.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. 
The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.runtime.checkpoint; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.JobID; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.runtime.checkpoint.CheckpointType.PostCheckpointAction; import org.apache.flink.runtime.checkpoint.hooks.MasterHooks; import org.apache.flink.runtime.concurrent.FutureUtils; import org.apache.flink.runtime.concurrent.ScheduledExecutor; import org.apache.flink.runtime.executiongraph.Execution; import org.apache.flink.runtime.executiongraph.ExecutionAttemptID; import org.apache.flink.runtime.executiongraph.ExecutionJobVertex; import org.apache.flink.runtime.executiongraph.ExecutionVertex; import org.apache.flink.runtime.executiongraph.JobStatusListener; import org.apache.flink.runtime.jobgraph.JobVertexID; import org.apache.flink.runtime.jobgraph.OperatorID; import org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration; import org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint; import org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint; import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; import org.apache.flink.runtime.operators.coordination.OperatorInfo; import org.apache.flink.runtime.state.CheckpointStorage; import org.apache.flink.runtime.state.CheckpointStorageCoordinatorView; import org.apache.flink.runtime.state.CheckpointStorageLocation; import org.apache.flink.runtime.state.CompletedCheckpointStorageLocation; import org.apache.flink.runtime.state.SharedStateRegistry; import org.apache.flink.runtime.state.SharedStateRegistryFactory; import org.apache.flink.runtime.state.memory.ByteStreamStateHandle; import org.apache.flink.util.ExceptionUtils; import org.apache.flink.util.FlinkRuntimeException; import org.apache.flink.util.Preconditions; import org.apache.flink.util.StringUtils; import org.apache.flink.util.clock.Clock; import org.apache.flink.util.clock.SystemClock; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import javax.annotation.concurrent.GuardedBy; import java.io.IOException; import java.util.ArrayDeque; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.OptionalLong; import java.util.PriorityQueue; import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletionException; import java.util.concurrent.Executor; import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Predicate; import java.util.stream.Stream; import static java.util.stream.Collectors.toMap; import static 
org.apache.flink.util.ExceptionUtils.findThrowable; import static org.apache.flink.util.Preconditions.checkArgument; import static org.apache.flink.util.Preconditions.checkNotNull; /** * The checkpoint coordinator coordinates the distributed snapshots of operators and state. It * triggers the checkpoint by sending the messages to the relevant tasks and collects the checkpoint * acknowledgements. It also collects and maintains the overview of the state handles reported by * the tasks that acknowledge the checkpoint. */ public class CheckpointCoordinator { private static final Logger LOG = LoggerFactory.getLogger(CheckpointCoordinator.class); /** The number of recent checkpoints whose IDs are remembered. */ private static final int NUM_GHOST_CHECKPOINT_IDS = 16; // ------------------------------------------------------------------------ /** Coordinator-wide lock to safeguard the checkpoint updates. */ private final Object lock = new Object(); /** The job whose checkpoint this coordinator coordinates. */ private final JobID job; /** Default checkpoint properties. * */ private final CheckpointProperties checkpointProperties; /** The executor used for asynchronous calls, like potentially blocking I/O. */ private final Executor executor; private final CheckpointsCleaner checkpointsCleaner; /** The operator coordinators that need to be checkpointed. */ private final Collection coordinatorsToCheckpoint; /** Map from checkpoint ID to the pending checkpoint. */ @GuardedBy("lock") private final Map pendingCheckpoints; /** * Completed checkpoints. Implementations can be blocking. Make sure calls to methods accessing * this don't block the job manager actor and run asynchronously. */ private final CompletedCheckpointStore completedCheckpointStore; /** * The root checkpoint state backend, which is responsible for initializing the checkpoint, * storing the metadata, and cleaning up the checkpoint. */ private final CheckpointStorageCoordinatorView checkpointStorageView; /** A list of recent checkpoint IDs, to identify late messages (vs invalid ones). */ private final ArrayDeque recentPendingCheckpoints; /** * Checkpoint ID counter to ensure ascending IDs. In case of job manager failures, these need to * be ascending across job managers. */ private final CheckpointIDCounter checkpointIdCounter; // TODO: ------------ start:二次开发代码 --------------- // /** * The base checkpoint interval. Actual trigger time may be affected by the max concurrent * checkpoints and minimum-pause values */ private long baseInterval; /** The max time (in ms) that a checkpoint may take. */ private long checkpointTimeout; /** * The min time(in ms) to delay after a checkpoint could be triggered. 
Allows to enforce minimum * processing time between checkpoint attempts */ private long minPauseBetweenCheckpoints; public long getBaseInterval() { return baseInterval; } public void setBaseInterval(long baseInterval) { this.baseInterval = baseInterval; } public void setCheckpointTimeout(long checkpointTimeout) { this.checkpointTimeout = checkpointTimeout; } public long getMinPauseBetweenCheckpoints() { return minPauseBetweenCheckpoints; } public void setMinPauseBetweenCheckpoints(long minPauseBetweenCheckpoints) { this.minPauseBetweenCheckpoints = minPauseBetweenCheckpoints; } private static CheckpointCoordinator coordinator; public static CheckpointCoordinator getInstance() { return CheckpointCoordinator.coordinator; } // TODO: ------------ end:二次开发代码 ----------------- // /** * The timer that handles the checkpoint timeouts and triggers periodic checkpoints. It must be * single-threaded. Eventually it will be replaced by main thread executor. */ private final ScheduledExecutor timer; /** The master checkpoint hooks executed by this checkpoint coordinator. */ private final HashMap> masterHooks; private final boolean unalignedCheckpointsEnabled; private final long alignmentTimeout; /** Actor that receives status updates from the execution graph this coordinator works for. */ private JobStatusListener jobStatusListener; /** The number of consecutive failed trigger attempts. */ private final AtomicInteger numUnsuccessfulCheckpointsTriggers = new AtomicInteger(0); /** A handle to the current periodic trigger, to cancel it when necessary. */ private ScheduledFuture currentPeriodicTrigger; /** * The timestamp (via {@link Clock#relativeTimeMillis()}) when the last checkpoint completed. */ private long lastCheckpointCompletionRelativeTime; /** * Flag whether a triggered checkpoint should immediately schedule the next checkpoint. * Non-volatile, because only accessed in synchronized scope */ private boolean periodicScheduling; /** Flag marking the coordinator as shut down (not accepting any messages any more). */ private volatile boolean shutdown; /** Optional tracker for checkpoint statistics. */ @Nullable private CheckpointStatsTracker statsTracker; /** A factory for SharedStateRegistry objects. */ private final SharedStateRegistryFactory sharedStateRegistryFactory; /** Registry that tracks state which is shared across (incremental) checkpoints. */ private SharedStateRegistry sharedStateRegistry; private boolean isPreferCheckpointForRecovery; private final CheckpointFailureManager failureManager; private final Clock clock; private final boolean isExactlyOnceMode; /** Flag represents there is an in-flight trigger request. 
*/ private boolean isTriggering = false; private final CheckpointRequestDecider requestDecider; private final CheckpointPlanCalculator checkpointPlanCalculator; private final ExecutionAttemptMappingProvider attemptMappingProvider; // -------------------------------------------------------------------------------------------- public CheckpointCoordinator( JobID job, CheckpointCoordinatorConfiguration chkConfig, Collection coordinatorsToCheckpoint, CheckpointIDCounter checkpointIDCounter, CompletedCheckpointStore completedCheckpointStore, CheckpointStorage checkpointStorage, Executor executor, CheckpointsCleaner checkpointsCleaner, ScheduledExecutor timer, SharedStateRegistryFactory sharedStateRegistryFactory, CheckpointFailureManager failureManager, CheckpointPlanCalculator checkpointPlanCalculator, ExecutionAttemptMappingProvider attemptMappingProvider) { this( job, chkConfig, coordinatorsToCheckpoint, checkpointIDCounter, completedCheckpointStore, checkpointStorage, executor, checkpointsCleaner, timer, sharedStateRegistryFactory, failureManager, checkpointPlanCalculator, attemptMappingProvider, SystemClock.getInstance()); } @VisibleForTesting public CheckpointCoordinator( JobID job, CheckpointCoordinatorConfiguration chkConfig, Collection coordinatorsToCheckpoint, CheckpointIDCounter checkpointIDCounter, CompletedCheckpointStore completedCheckpointStore, CheckpointStorage checkpointStorage, Executor executor, CheckpointsCleaner checkpointsCleaner, ScheduledExecutor timer, SharedStateRegistryFactory sharedStateRegistryFactory, CheckpointFailureManager failureManager, CheckpointPlanCalculator checkpointPlanCalculator, ExecutionAttemptMappingProvider attemptMappingProvider, Clock clock) { // sanity checks checkNotNull(checkpointStorage); // max "in between duration" can be one year - this is to prevent numeric overflows long minPauseBetweenCheckpoints = chkConfig.getMinPauseBetweenCheckpoints(); if (minPauseBetweenCheckpoints > 365L * 24 * 60 * 60 * 1_000) { minPauseBetweenCheckpoints = 365L * 24 * 60 * 60 * 1_000; } // it does not make sense to schedule checkpoints more often then the desired // time between checkpoints long baseInterval = chkConfig.getCheckpointInterval(); if (baseInterval < minPauseBetweenCheckpoints) { baseInterval = minPauseBetweenCheckpoints; } this.job = checkNotNull(job); this.baseInterval = baseInterval; this.checkpointTimeout = chkConfig.getCheckpointTimeout(); this.minPauseBetweenCheckpoints = minPauseBetweenCheckpoints; this.coordinatorsToCheckpoint = Collections.unmodifiableCollection(coordinatorsToCheckpoint); this.pendingCheckpoints = new LinkedHashMap<>(); this.checkpointIdCounter = checkNotNull(checkpointIDCounter); this.completedCheckpointStore = checkNotNull(completedCheckpointStore); this.executor = checkNotNull(executor); this.checkpointsCleaner = checkNotNull(checkpointsCleaner); this.sharedStateRegistryFactory = checkNotNull(sharedStateRegistryFactory); this.sharedStateRegistry = sharedStateRegistryFactory.create(executor); this.isPreferCheckpointForRecovery = chkConfig.isPreferCheckpointForRecovery(); this.failureManager = checkNotNull(failureManager); this.checkpointPlanCalculator = checkNotNull(checkpointPlanCalculator); this.attemptMappingProvider = checkNotNull(attemptMappingProvider); this.clock = checkNotNull(clock); this.isExactlyOnceMode = chkConfig.isExactlyOnce(); this.unalignedCheckpointsEnabled = chkConfig.isUnalignedCheckpointsEnabled(); this.alignmentTimeout = chkConfig.getAlignmentTimeout(); this.recentPendingCheckpoints = new 
ArrayDeque<>(NUM_GHOST_CHECKPOINT_IDS); this.masterHooks = new HashMap<>(); this.timer = timer; this.checkpointProperties = CheckpointProperties.forCheckpoint(chkConfig.getCheckpointRetentionPolicy()); try { this.checkpointStorageView = checkpointStorage.createCheckpointStorage(job); checkpointStorageView.initializeBaseLocations(); } catch (IOException e) { throw new FlinkRuntimeException( "Failed to create checkpoint storage at checkpoint coordinator side.", e); } try { // Make sure the checkpoint ID enumerator is running. Possibly // issues a blocking call to ZooKeeper. checkpointIDCounter.start(); } catch (Throwable t) { throw new RuntimeException( "Failed to start checkpoint ID counter: " + t.getMessage(), t); } this.requestDecider = new CheckpointRequestDecider( chkConfig.getMaxConcurrentCheckpoints(), this::rescheduleTrigger, this.clock, this.minPauseBetweenCheckpoints, this.pendingCheckpoints::size, this.checkpointsCleaner::getNumberOfCheckpointsToClean); // TODO: ------------ start:二次开发代码 --------------- // CheckpointCoordinator.coordinator = this; // TODO: ------------ end:二次开发代码 --------------- // } // -------------------------------------------------------------------------------------------- // Configuration // -------------------------------------------------------------------------------------------- /** * Adds the given master hook to the checkpoint coordinator. This method does nothing, if the * checkpoint coordinator already contained a hook with the same ID (as defined via {@link * MasterTriggerRestoreHook#getIdentifier()}). * * @param hook The hook to add. * @return True, if the hook was added, false if the checkpoint coordinator already contained a * hook with the same ID. */ public boolean addMasterHook(MasterTriggerRestoreHook hook) { checkNotNull(hook); final String id = hook.getIdentifier(); checkArgument(!StringUtils.isNullOrWhitespaceOnly(id), "The hook has a null or empty id"); synchronized (lock) { if (!masterHooks.containsKey(id)) { masterHooks.put(id, hook); return true; } else { return false; } } } /** Gets the number of currently register master hooks. */ public int getNumberOfRegisteredMasterHooks() { synchronized (lock) { return masterHooks.size(); } } /** * Sets the checkpoint stats tracker. * * @param statsTracker The checkpoint stats tracker. */ public void setCheckpointStatsTracker(@Nullable CheckpointStatsTracker statsTracker) { this.statsTracker = statsTracker; } // -------------------------------------------------------------------------------------------- // Clean shutdown // -------------------------------------------------------------------------------------------- /** * Shuts down the checkpoint coordinator. * *
<p>
After this method has been called, the coordinator does not accept any further messages * and cannot trigger any further checkpoints. */ public void shutdown() throws Exception { synchronized (lock) { if (!shutdown) { shutdown = true; LOG.info("Stopping checkpoint coordinator for job {}.", job); periodicScheduling = false; // shut down the hooks MasterHooks.close(masterHooks.values(), LOG); masterHooks.clear(); final CheckpointException reason = new CheckpointException( CheckpointFailureReason.CHECKPOINT_COORDINATOR_SHUTDOWN); // clear queued requests and in-flight checkpoints abortPendingAndQueuedCheckpoints(reason); } } } public boolean isShutdown() { return shutdown; } // -------------------------------------------------------------------------------------------- // Triggering Checkpoints and Savepoints // -------------------------------------------------------------------------------------------- /** * Triggers a savepoint with the given savepoint directory as a target. * * @param targetLocation Target location for the savepoint, optional. If null, the state * backend's configured default will be used. * @return A future to the completed checkpoint * @throws IllegalStateException If no savepoint directory has been specified and no default * savepoint directory has been configured */ public CompletableFuture triggerSavepoint( @Nullable final String targetLocation) { final CheckpointProperties properties = CheckpointProperties.forSavepoint(!unalignedCheckpointsEnabled); return triggerSavepointInternal(properties, targetLocation); } /** * Triggers a synchronous savepoint with the given savepoint directory as a target. * * @param terminate flag indicating if the job should terminate or just suspend * @param targetLocation Target location for the savepoint, optional. If null, the state * backend's configured default will be used. * @return A future to the completed checkpoint * @throws IllegalStateException If no savepoint directory has been specified and no default * savepoint directory has been configured */ public CompletableFuture triggerSynchronousSavepoint( final boolean terminate, @Nullable final String targetLocation) { final CheckpointProperties properties = CheckpointProperties.forSyncSavepoint(!unalignedCheckpointsEnabled, terminate); return triggerSavepointInternal(properties, targetLocation); } private CompletableFuture triggerSavepointInternal( final CheckpointProperties checkpointProperties, @Nullable final String targetLocation) { checkNotNull(checkpointProperties); // TODO, call triggerCheckpoint directly after removing timer thread // for now, execute the trigger in timer thread to avoid competition final CompletableFuture resultFuture = new CompletableFuture<>(); timer.execute( () -> triggerCheckpoint(checkpointProperties, targetLocation, false) .whenComplete( (completedCheckpoint, throwable) -> { if (throwable == null) { resultFuture.complete(completedCheckpoint); } else { resultFuture.completeExceptionally(throwable); } })); return resultFuture; } /** * Triggers a new standard checkpoint and uses the given timestamp as the checkpoint timestamp. * The return value is a future. It completes when the triggered checkpoint finishes or an error * occurs. * * @param isPeriodic Flag indicating whether this triggered checkpoint is periodic. If this flag * is true, but the periodic scheduler is disabled, the checkpoint will be declined. * @return a future to the completed checkpoint.
*/ public CompletableFuture triggerCheckpoint(boolean isPeriodic) { return triggerCheckpoint(checkpointProperties, null, isPeriodic); } @VisibleForTesting public CompletableFuture triggerCheckpoint( CheckpointProperties props, @Nullable String externalSavepointLocation, boolean isPeriodic) { if (props.getCheckpointType().getPostCheckpointAction() == PostCheckpointAction.TERMINATE && !(props.isSynchronous() && props.isSavepoint())) { return FutureUtils.completedExceptionally( new IllegalArgumentException( "Only synchronous savepoints are allowed to advance the watermark to MAX.")); } CheckpointTriggerRequest request = new CheckpointTriggerRequest(props, externalSavepointLocation, isPeriodic); chooseRequestToExecute(request).ifPresent(this::startTriggeringCheckpoint); return request.onCompletionPromise; } private void startTriggeringCheckpoint(CheckpointTriggerRequest request) { try { synchronized (lock) { preCheckGlobalState(request.isPeriodic); } // we will actually trigger this checkpoint! Preconditions.checkState(!isTriggering); isTriggering = true; final long timestamp = System.currentTimeMillis(); CompletableFuture checkpointPlanFuture = checkpointPlanCalculator.calculateCheckpointPlan(); final CompletableFuture pendingCheckpointCompletableFuture = checkpointPlanFuture .thenApplyAsync( plan -> { try { CheckpointIdAndStorageLocation checkpointIdAndStorageLocation = initializeCheckpoint( request.props, request.externalSavepointLocation); return new Tuple2<>( plan, checkpointIdAndStorageLocation); } catch (Throwable e) { throw new CompletionException(e); } }, executor) .thenApplyAsync( (checkpointInfo) -> createPendingCheckpoint( timestamp, request.props, checkpointInfo.f0, request.isPeriodic, checkpointInfo.f1.checkpointId, checkpointInfo.f1.checkpointStorageLocation, request.getOnCompletionFuture()), timer); final CompletableFuture coordinatorCheckpointsComplete = pendingCheckpointCompletableFuture.thenComposeAsync( (pendingCheckpoint) -> OperatorCoordinatorCheckpoints .triggerAndAcknowledgeAllCoordinatorCheckpointsWithCompletion( coordinatorsToCheckpoint, pendingCheckpoint, timer), timer); // We have to take the snapshot of the master hooks after the coordinator checkpoints // has completed. // This is to ensure the tasks are checkpointed after the OperatorCoordinators in case // ExternallyInducedSource is used. final CompletableFuture masterStatesComplete = coordinatorCheckpointsComplete.thenComposeAsync( ignored -> { // If the code reaches here, the pending checkpoint is guaranteed to // be not null. // We use FutureUtils.getWithoutException() to make compiler happy // with checked // exceptions in the signature. 
PendingCheckpoint checkpoint = FutureUtils.getWithoutException( pendingCheckpointCompletableFuture); return snapshotMasterState(checkpoint); }, timer); FutureUtils.assertNoException( CompletableFuture.allOf(masterStatesComplete, coordinatorCheckpointsComplete) .handleAsync( (ignored, throwable) -> { final PendingCheckpoint checkpoint = FutureUtils.getWithoutException( pendingCheckpointCompletableFuture); Preconditions.checkState( checkpoint != null || throwable != null, "Either the pending checkpoint needs to be created or an error must have been occurred."); if (throwable != null) { // the initialization might not be finished yet if (checkpoint == null) { onTriggerFailure(request, throwable); } else { onTriggerFailure(checkpoint, throwable); } } else { if (checkpoint.isDisposed()) { onTriggerFailure( checkpoint, new CheckpointException( CheckpointFailureReason .TRIGGER_CHECKPOINT_FAILURE, checkpoint.getFailureCause())); } else { // no exception, no discarding, everything is OK final long checkpointId = checkpoint.getCheckpointId(); snapshotTaskState( timestamp, checkpointId, checkpoint.getCheckpointStorageLocation(), request.props, checkpoint .getCheckpointPlan() .getTasksToTrigger()); coordinatorsToCheckpoint.forEach( (ctx) -> ctx.afterSourceBarrierInjection( checkpointId)); // It is possible that the tasks has finished // checkpointing at this point. // So we need to complete this pending checkpoint. if (!maybeCompleteCheckpoint(checkpoint)) { return null; } onTriggerSuccess(); } } return null; }, timer) .exceptionally( error -> { if (!isShutdown()) { throw new CompletionException(error); } else if (findThrowable( error, RejectedExecutionException.class) .isPresent()) { LOG.debug("Execution rejected during shutdown"); } else { LOG.warn("Error encountered during shutdown", error); } return null; })); } catch (Throwable throwable) { onTriggerFailure(request, throwable); } } /** * Initialize the checkpoint trigger asynchronously. It will expected to be executed in io * thread due to it might be time-consuming. * * @param props checkpoint properties * @param externalSavepointLocation the external savepoint location, it might be null * @return the initialized result, checkpoint id and checkpoint location */ private CheckpointIdAndStorageLocation initializeCheckpoint( CheckpointProperties props, @Nullable String externalSavepointLocation) throws Exception { // this must happen outside the coordinator-wide lock, because it // communicates // with external services (in HA mode) and may block for a while. long checkpointID = checkpointIdCounter.getAndIncrement(); CheckpointStorageLocation checkpointStorageLocation = props.isSavepoint() ? checkpointStorageView.initializeLocationForSavepoint( checkpointID, externalSavepointLocation) : checkpointStorageView.initializeLocationForCheckpoint(checkpointID); return new CheckpointIdAndStorageLocation(checkpointID, checkpointStorageLocation); } private PendingCheckpoint createPendingCheckpoint( long timestamp, CheckpointProperties props, CheckpointPlan checkpointPlan, boolean isPeriodic, long checkpointID, CheckpointStorageLocation checkpointStorageLocation, CompletableFuture onCompletionPromise) { synchronized (lock) { try { // since we haven't created the PendingCheckpoint yet, we need to check the // global state here. 
preCheckGlobalState(isPeriodic); } catch (Throwable t) { throw new CompletionException(t); } } final PendingCheckpoint checkpoint = new PendingCheckpoint( job, checkpointID, timestamp, checkpointPlan, OperatorInfo.getIds(coordinatorsToCheckpoint), masterHooks.keySet(), props, checkpointStorageLocation, onCompletionPromise); trackPendingCheckpointStats(checkpoint); synchronized (lock) { pendingCheckpoints.put(checkpointID, checkpoint); ScheduledFuture cancellerHandle = timer.schedule( new CheckpointCanceller(checkpoint), checkpointTimeout, TimeUnit.MILLISECONDS); if (!checkpoint.setCancellerHandle(cancellerHandle)) { // checkpoint is already disposed! cancellerHandle.cancel(false); } } LOG.info( "Triggering checkpoint {} (type={}) @ {} for job {}.", checkpointID, checkpoint.getProps().getCheckpointType(), timestamp, job); return checkpoint; } /** * Snapshot master hook states asynchronously. * * @param checkpoint the pending checkpoint * @return the future represents master hook states are finished or not */ private CompletableFuture snapshotMasterState(PendingCheckpoint checkpoint) { if (masterHooks.isEmpty()) { return CompletableFuture.completedFuture(null); } final long checkpointID = checkpoint.getCheckpointId(); final long timestamp = checkpoint.getCheckpointTimestamp(); final CompletableFuture masterStateCompletableFuture = new CompletableFuture<>(); for (MasterTriggerRestoreHook masterHook : masterHooks.values()) { MasterHooks.triggerHook(masterHook, checkpointID, timestamp, executor) .whenCompleteAsync( (masterState, throwable) -> { try { synchronized (lock) { if (masterStateCompletableFuture.isDone()) { return; } if (checkpoint.isDisposed()) { throw new IllegalStateException( "Checkpoint " + checkpointID + " has been discarded"); } if (throwable == null) { checkpoint.acknowledgeMasterState( masterHook.getIdentifier(), masterState); if (checkpoint.areMasterStatesFullyAcknowledged()) { masterStateCompletableFuture.complete(null); } } else { masterStateCompletableFuture.completeExceptionally( throwable); } } } catch (Throwable t) { masterStateCompletableFuture.completeExceptionally(t); } }, timer); } return masterStateCompletableFuture; } /** * Snapshot task state. * * @param timestamp the timestamp of this checkpoint reques * @param checkpointID the checkpoint id * @param checkpointStorageLocation the checkpoint location * @param props the checkpoint properties * @param tasksToTrigger the executions which should be triggered */ private void snapshotTaskState( long timestamp, long checkpointID, CheckpointStorageLocation checkpointStorageLocation, CheckpointProperties props, List tasksToTrigger) { final CheckpointOptions checkpointOptions = CheckpointOptions.forConfig( props.getCheckpointType(), checkpointStorageLocation.getLocationReference(), isExactlyOnceMode, unalignedCheckpointsEnabled, alignmentTimeout); // send the messages to the tasks that trigger their checkpoint for (Execution execution : tasksToTrigger) { if (props.isSynchronous()) { execution.triggerSynchronousSavepoint(checkpointID, timestamp, checkpointOptions); } else { execution.triggerCheckpoint(checkpointID, timestamp, checkpointOptions); } } } /** Trigger request is successful. NOTE, it must be invoked if trigger request is successful. */ private void onTriggerSuccess() { isTriggering = false; numUnsuccessfulCheckpointsTriggers.set(0); executeQueuedRequest(); } /** * The trigger request is failed prematurely without a proper initialization. 
There is no * resource to release, but the completion promise needs to fail manually here. * * @param onCompletionPromise the completion promise of the checkpoint/savepoint * @param throwable the reason of trigger failure */ private void onTriggerFailure( CheckpointTriggerRequest onCompletionPromise, Throwable throwable) { final CheckpointException checkpointException = getCheckpointException( CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE, throwable); onCompletionPromise.completeExceptionally(checkpointException); onTriggerFailure((PendingCheckpoint) null, checkpointException); } /** * The trigger request is failed. NOTE, it must be invoked if trigger request is failed. * * @param checkpoint the pending checkpoint which is failed. It could be null if it's failed * prematurely without a proper initialization. * @param throwable the reason of trigger failure */ private void onTriggerFailure(@Nullable PendingCheckpoint checkpoint, Throwable throwable) { // beautify the stack trace a bit throwable = ExceptionUtils.stripCompletionException(throwable); try { coordinatorsToCheckpoint.forEach( OperatorCoordinatorCheckpointContext::abortCurrentTriggering); if (checkpoint != null && !checkpoint.isDisposed()) { int numUnsuccessful = numUnsuccessfulCheckpointsTriggers.incrementAndGet(); LOG.warn( "Failed to trigger checkpoint {} for job {}. ({} consecutive failed attempts so far)", checkpoint.getCheckpointId(), job, numUnsuccessful, throwable); final CheckpointException cause = getCheckpointException( CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE, throwable); synchronized (lock) { abortPendingCheckpoint(checkpoint, cause); } } else { LOG.info( "Failed to trigger checkpoint for job {} since {}", job, throwable.getMessage()); } } finally { isTriggering = false; executeQueuedRequest(); } } private void executeQueuedRequest() { chooseQueuedRequestToExecute().ifPresent(this::startTriggeringCheckpoint); } private Optional chooseQueuedRequestToExecute() { synchronized (lock) { return requestDecider.chooseQueuedRequestToExecute( isTriggering, lastCheckpointCompletionRelativeTime); } } private Optional chooseRequestToExecute( CheckpointTriggerRequest request) { synchronized (lock) { return requestDecider.chooseRequestToExecute( request, isTriggering, lastCheckpointCompletionRelativeTime); } } // Returns true if the checkpoint is successfully completed, false otherwise. private boolean maybeCompleteCheckpoint(PendingCheckpoint checkpoint) { synchronized (lock) { if (checkpoint.isFullyAcknowledged()) { try { // we need to check inside the lock for being shutdown as well, // otherwise we get races and invalid error log messages. if (shutdown) { return false; } completePendingCheckpoint(checkpoint); } catch (CheckpointException ce) { onTriggerFailure(checkpoint, ce); return false; } } } return true; } // -------------------------------------------------------------------------------------------- // Handling checkpoints and messages // -------------------------------------------------------------------------------------------- /** * Receives a {@link DeclineCheckpoint} message for a pending checkpoint. 
* * @param message Checkpoint decline from the task manager * @param taskManagerLocationInfo The location info of the decline checkpoint message's sender */ public void receiveDeclineMessage(DeclineCheckpoint message, String taskManagerLocationInfo) { if (shutdown || message == null) { return; } if (!job.equals(message.getJob())) { throw new IllegalArgumentException( "Received DeclineCheckpoint message for job " + message.getJob() + " from " + taskManagerLocationInfo + " while this coordinator handles job " + job); } final long checkpointId = message.getCheckpointId(); final CheckpointException checkpointException = message.getSerializedCheckpointException().unwrap(); final String reason = checkpointException.getMessage(); PendingCheckpoint checkpoint; synchronized (lock) { // we need to check inside the lock for being shutdown as well, otherwise we // get races and invalid error log messages if (shutdown) { return; } checkpoint = pendingCheckpoints.get(checkpointId); if (checkpoint != null) { Preconditions.checkState( !checkpoint.isDisposed(), "Received message for discarded but non-removed checkpoint " + checkpointId); LOG.info( "Decline checkpoint {} by task {} of job {} at {}.", checkpointId, message.getTaskExecutionId(), job, taskManagerLocationInfo, checkpointException.getCause()); abortPendingCheckpoint( checkpoint, checkpointException, message.getTaskExecutionId()); } else if (LOG.isDebugEnabled()) { if (recentPendingCheckpoints.contains(checkpointId)) { // message is for an unknown checkpoint, or comes too late (checkpoint disposed) LOG.debug( "Received another decline message for now expired checkpoint attempt {} from task {} of job {} at {} : {}", checkpointId, message.getTaskExecutionId(), job, taskManagerLocationInfo, reason); } else { // message is for an unknown checkpoint. might be so old that we don't even // remember it any more LOG.debug( "Received decline message for unknown (too old?) checkpoint attempt {} from task {} of job {} at {} : {}", checkpointId, message.getTaskExecutionId(), job, taskManagerLocationInfo, reason); } } } } /** * Receives an AcknowledgeCheckpoint message and returns whether the message was associated with * a pending checkpoint. * * @param message Checkpoint ack from the task manager * @param taskManagerLocationInfo The location of the acknowledge checkpoint message's sender * @return Flag indicating whether the ack'd checkpoint was associated with a pending * checkpoint. * @throws CheckpointException If the checkpoint cannot be added to the completed checkpoint * store. 
*/ public boolean receiveAcknowledgeMessage( AcknowledgeCheckpoint message, String taskManagerLocationInfo) throws CheckpointException { if (shutdown || message == null) { return false; } if (!job.equals(message.getJob())) { LOG.error( "Received wrong AcknowledgeCheckpoint message for job {} from {} : {}", job, taskManagerLocationInfo, message); return false; } final long checkpointId = message.getCheckpointId(); synchronized (lock) { // we need to check inside the lock for being shutdown as well, otherwise we // get races and invalid error log messages if (shutdown) { return false; } final PendingCheckpoint checkpoint = pendingCheckpoints.get(checkpointId); if (checkpoint != null && !checkpoint.isDisposed()) { switch (checkpoint.acknowledgeTask( message.getTaskExecutionId(), message.getSubtaskState(), message.getCheckpointMetrics(), getStatsCallback(checkpoint))) { case SUCCESS: LOG.debug( "Received acknowledge message for checkpoint {} from task {} of job {} at {}.", checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); if (checkpoint.isFullyAcknowledged()) { completePendingCheckpoint(checkpoint); } break; case DUPLICATE: LOG.debug( "Received a duplicate acknowledge message for checkpoint {}, task {}, job {}, location {}.", message.getCheckpointId(), message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); break; case UNKNOWN: LOG.warn( "Could not acknowledge the checkpoint {} for task {} of job {} at {}, " + "because the task's execution attempt id was unknown. Discarding " + "the state handle to avoid lingering state.", message.getCheckpointId(), message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); discardSubtaskState( message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); break; case DISCARDED: LOG.warn( "Could not acknowledge the checkpoint {} for task {} of job {} at {}, " + "because the pending checkpoint had been discarded. Discarding the " + "state handle tp avoid lingering state.", message.getCheckpointId(), message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); discardSubtaskState( message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); } return true; } else if (checkpoint != null) { // this should not happen throw new IllegalStateException( "Received message for discarded but non-removed checkpoint " + checkpointId); } else { reportStats( message.getCheckpointId(), message.getTaskExecutionId(), message.getCheckpointMetrics()); boolean wasPendingCheckpoint; // message is for an unknown checkpoint, or comes too late (checkpoint disposed) if (recentPendingCheckpoints.contains(checkpointId)) { wasPendingCheckpoint = true; LOG.warn( "Received late message for now expired checkpoint attempt {} from task " + "{} of job {} at {}.", checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); } else { LOG.debug( "Received message for an unknown checkpoint {} from task {} of job {} at {}.", checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); wasPendingCheckpoint = false; } // try to discard the state so that we don't have lingering state lying around discardSubtaskState( message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); return wasPendingCheckpoint; } } } /** * Try to complete the given pending checkpoint. * *
<p>
Important: This method should only be called in the checkpoint lock scope. * * @param pendingCheckpoint to complete * @throws CheckpointException if the completion failed */ private void completePendingCheckpoint(PendingCheckpoint pendingCheckpoint) throws CheckpointException { final long checkpointId = pendingCheckpoint.getCheckpointId(); final CompletedCheckpoint completedCheckpoint; // As a first step to complete the checkpoint, we register its state with the registry Map operatorStates = pendingCheckpoint.getOperatorStates(); sharedStateRegistry.registerAll(operatorStates.values()); try { try { completedCheckpoint = pendingCheckpoint.finalizeCheckpoint( checkpointsCleaner, this::scheduleTriggerRequest, executor, getStatsCallback(pendingCheckpoint)); failureManager.handleCheckpointSuccess(pendingCheckpoint.getCheckpointId()); } catch (Exception e1) { // abort the current pending checkpoint if we fails to finalize the pending // checkpoint. if (!pendingCheckpoint.isDisposed()) { abortPendingCheckpoint( pendingCheckpoint, new CheckpointException( CheckpointFailureReason.FINALIZE_CHECKPOINT_FAILURE, e1)); } throw new CheckpointException( "Could not finalize the pending checkpoint " + checkpointId + '.', CheckpointFailureReason.FINALIZE_CHECKPOINT_FAILURE, e1); } // the pending checkpoint must be discarded after the finalization Preconditions.checkState(pendingCheckpoint.isDisposed() && completedCheckpoint != null); try { completedCheckpointStore.addCheckpoint( completedCheckpoint, checkpointsCleaner, this::scheduleTriggerRequest); } catch (Exception exception) { // we failed to store the completed checkpoint. Let's clean up executor.execute( new Runnable() { @Override public void run() { try { completedCheckpoint.discardOnFailedStoring(); } catch (Throwable t) { LOG.warn( "Could not properly discard completed checkpoint {}.", completedCheckpoint.getCheckpointID(), t); } } }); sendAbortedMessages( pendingCheckpoint.getCheckpointPlan().getTasksToCommitTo(), checkpointId, pendingCheckpoint.getCheckpointTimestamp()); throw new CheckpointException( "Could not complete the pending checkpoint " + checkpointId + '.', CheckpointFailureReason.FINALIZE_CHECKPOINT_FAILURE, exception); } } finally { pendingCheckpoints.remove(checkpointId); scheduleTriggerRequest(); } rememberRecentCheckpointId(checkpointId); // drop those pending checkpoints that are at prior to the completed one dropSubsumedCheckpoints(checkpointId); // record the time when this was completed, to calculate // the 'min delay between checkpoints' lastCheckpointCompletionRelativeTime = clock.relativeTimeMillis(); LOG.info( "Completed checkpoint {} for job {} ({} bytes in {} ms).", checkpointId, job, completedCheckpoint.getStateSize(), completedCheckpoint.getDuration()); if (LOG.isDebugEnabled()) { StringBuilder builder = new StringBuilder(); builder.append("Checkpoint state: "); for (OperatorState state : completedCheckpoint.getOperatorStates().values()) { builder.append(state); builder.append(", "); } // Remove last two chars ", " builder.setLength(builder.length() - 2); LOG.debug(builder.toString()); } // send the "notify complete" call to all vertices, coordinators, etc. 
sendAcknowledgeMessages( pendingCheckpoint.getCheckpointPlan().getTasksToCommitTo(), checkpointId, completedCheckpoint.getTimestamp()); } void scheduleTriggerRequest() { synchronized (lock) { if (isShutdown()) { LOG.debug( "Skip scheduling trigger request because the CheckpointCoordinator is shut down"); } else { timer.execute(this::executeQueuedRequest); } } } private void sendAcknowledgeMessages( List tasksToCommit, long checkpointId, long timestamp) { // commit tasks for (ExecutionVertex ev : tasksToCommit) { Execution ee = ev.getCurrentExecutionAttempt(); if (ee != null) { ee.notifyCheckpointComplete(checkpointId, timestamp); } } // commit coordinators for (OperatorCoordinatorCheckpointContext coordinatorContext : coordinatorsToCheckpoint) { coordinatorContext.notifyCheckpointComplete(checkpointId); } } private void sendAbortedMessages( List tasksToAbort, long checkpointId, long timeStamp) { // send notification of aborted checkpoints asynchronously. executor.execute( () -> { // send the "abort checkpoint" messages to necessary vertices. for (ExecutionVertex ev : tasksToAbort) { Execution ee = ev.getCurrentExecutionAttempt(); if (ee != null) { ee.notifyCheckpointAborted(checkpointId, timeStamp); } } }); // commit coordinators for (OperatorCoordinatorCheckpointContext coordinatorContext : coordinatorsToCheckpoint) { coordinatorContext.notifyCheckpointAborted(checkpointId); } } /** * Fails all pending checkpoints which have not been acknowledged by the given execution attempt * id. * * @param executionAttemptId for which to discard unacknowledged pending checkpoints * @param cause of the failure */ public void failUnacknowledgedPendingCheckpointsFor( ExecutionAttemptID executionAttemptId, Throwable cause) { synchronized (lock) { abortPendingCheckpoints( checkpoint -> !checkpoint.isAcknowledgedBy(executionAttemptId), new CheckpointException(CheckpointFailureReason.TASK_FAILURE, cause)); } } private void rememberRecentCheckpointId(long id) { if (recentPendingCheckpoints.size() >= NUM_GHOST_CHECKPOINT_IDS) { recentPendingCheckpoints.removeFirst(); } recentPendingCheckpoints.addLast(id); } private void dropSubsumedCheckpoints(long checkpointId) { abortPendingCheckpoints( checkpoint -> checkpoint.getCheckpointId() < checkpointId && checkpoint.canBeSubsumed(), new CheckpointException(CheckpointFailureReason.CHECKPOINT_SUBSUMED)); } // -------------------------------------------------------------------------------------------- // Checkpoint State Restoring // -------------------------------------------------------------------------------------------- /** * Restores the latest checkpointed state. * * @param tasks Map of job vertices to restore. State for these vertices is restored via {@link * Execution#setInitialState(JobManagerTaskRestore)}. * @param errorIfNoCheckpoint Fail if no completed checkpoint is available to restore from. * @param allowNonRestoredState Allow checkpoint state that cannot be mapped to any job vertex * in tasks. * @return true if state was restored, false otherwise. * @throws IllegalStateException If the CheckpointCoordinator is shut down. * @throws IllegalStateException If no completed checkpoint is available and the * failIfNoCheckpoint flag has been set. * @throws IllegalStateException If the checkpoint contains state that cannot be mapped to any * job vertex in tasks and the allowNonRestoredState flag has not * been set. * @throws IllegalStateException If the max parallelism changed for an operator that restores * state from this checkpoint. 
* @throws IllegalStateException If the parallelism changed for an operator that restores * non-partitioned state from this checkpoint. */ @Deprecated public boolean restoreLatestCheckpointedState( Map tasks, boolean errorIfNoCheckpoint, boolean allowNonRestoredState) throws Exception { final OptionalLong restoredCheckpointId = restoreLatestCheckpointedStateInternal( new HashSet<>(tasks.values()), OperatorCoordinatorRestoreBehavior.RESTORE_OR_RESET, errorIfNoCheckpoint, allowNonRestoredState); return restoredCheckpointId.isPresent(); } /** * Restores the latest checkpointed state to a set of subtasks. This method represents a "local" * or "regional" failover and does restore states to coordinators. Note that a regional failover * might still include all tasks. * * @param tasks Set of job vertices to restore. State for these vertices is restored via {@link * Execution#setInitialState(JobManagerTaskRestore)}. * @return An {@code OptionalLong} with the checkpoint ID, if state was restored, an empty * {@code OptionalLong} otherwise. * @throws IllegalStateException If the CheckpointCoordinator is shut down. * @throws IllegalStateException If no completed checkpoint is available and the * failIfNoCheckpoint flag has been set. * @throws IllegalStateException If the checkpoint contains state that cannot be mapped to any * job vertex in tasks and the allowNonRestoredState flag has not * been set. * @throws IllegalStateException If the max parallelism changed for an operator that restores * state from this checkpoint. * @throws IllegalStateException If the parallelism changed for an operator that restores * non-partitioned state from this checkpoint. */ public OptionalLong restoreLatestCheckpointedStateToSubtasks( final Set tasks) throws Exception { // when restoring subtasks only we accept potentially unmatched state for the // following reasons // - the set frequently does not include all Job Vertices (only the ones that are part // of the restarted region), meaning there will be unmatched state by design. // - because what we might end up restoring from an original savepoint with unmatched // state, if there is was no checkpoint yet. return restoreLatestCheckpointedStateInternal( tasks, OperatorCoordinatorRestoreBehavior .SKIP, // local/regional recovery does not reset coordinators false, // recovery might come before first successful checkpoint true); // see explanation above } /** * Restores the latest checkpointed state to all tasks and all coordinators. This method * represents a "global restore"-style operation where all stateful tasks and coordinators from * the given set of Job Vertices are restored. are restored to their latest checkpointed state. * * @param tasks Set of job vertices to restore. State for these vertices is restored via {@link * Execution#setInitialState(JobManagerTaskRestore)}. * @param allowNonRestoredState Allow checkpoint state that cannot be mapped to any job vertex * in tasks. * @return true if state was restored, false otherwise. * @throws IllegalStateException If the CheckpointCoordinator is shut down. * @throws IllegalStateException If no completed checkpoint is available and the * failIfNoCheckpoint flag has been set. * @throws IllegalStateException If the checkpoint contains state that cannot be mapped to any * job vertex in tasks and the allowNonRestoredState flag has not * been set. * @throws IllegalStateException If the max parallelism changed for an operator that restores * state from this checkpoint. 
* @throws IllegalStateException If the parallelism changed for an operator that restores * non-partitioned state from this checkpoint. */ public boolean restoreLatestCheckpointedStateToAll( final Set tasks, final boolean allowNonRestoredState) throws Exception { final OptionalLong restoredCheckpointId = restoreLatestCheckpointedStateInternal( tasks, OperatorCoordinatorRestoreBehavior .RESTORE_OR_RESET, // global recovery restores coordinators, or // resets them to empty false, // recovery might come before first successful checkpoint allowNonRestoredState); return restoredCheckpointId.isPresent(); } /** * Restores the latest checkpointed at the beginning of the job execution. If there is a * checkpoint, this method acts like a "global restore"-style operation where all stateful tasks * and coordinators from the given set of Job Vertices are restored. * * @param tasks Set of job vertices to restore. State for these vertices is restored via {@link * Execution#setInitialState(JobManagerTaskRestore)}. * @return True, if a checkpoint was found and its state was restored, false otherwise. */ public boolean restoreInitialCheckpointIfPresent(final Set tasks) throws Exception { final OptionalLong restoredCheckpointId = restoreLatestCheckpointedStateInternal( tasks, OperatorCoordinatorRestoreBehavior.RESTORE_IF_CHECKPOINT_PRESENT, false, // initial checkpoints exist only on JobManager failover. ok if not // present. false); // JobManager failover means JobGraphs match exactly. return restoredCheckpointId.isPresent(); } /** * Performs the actual restore operation to the given tasks. * *
<p>
This method returns the restored checkpoint ID (as an optional) or an empty optional, if * no checkpoint was restored. */ private OptionalLong restoreLatestCheckpointedStateInternal( final Set tasks, final OperatorCoordinatorRestoreBehavior operatorCoordinatorRestoreBehavior, final boolean errorIfNoCheckpoint, final boolean allowNonRestoredState) throws Exception { synchronized (lock) { if (shutdown) { throw new IllegalStateException("CheckpointCoordinator is shut down"); } // We create a new shared state registry object, so that all pending async disposal // requests from previous // runs will go against the old object (were they can do no harm). // This must happen under the checkpoint lock. sharedStateRegistry.close(); sharedStateRegistry = sharedStateRegistryFactory.create(executor); // Recover the checkpoints, TODO this could be done only when there is a new leader, not // on each recovery completedCheckpointStore.recover(); // Now, we re-register all (shared) states from the checkpoint store with the new // registry for (CompletedCheckpoint completedCheckpoint : completedCheckpointStore.getAllCheckpoints()) { completedCheckpoint.registerSharedStatesAfterRestored(sharedStateRegistry); } LOG.debug( "Status of the shared state registry of job {} after restore: {}.", job, sharedStateRegistry); // Restore from the latest checkpoint CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint(isPreferCheckpointForRecovery); if (latest == null) { LOG.info("No checkpoint found during restore."); if (errorIfNoCheckpoint) { throw new IllegalStateException("No completed checkpoint available"); } LOG.debug("Resetting the master hooks."); MasterHooks.reset(masterHooks.values(), LOG); if (operatorCoordinatorRestoreBehavior == OperatorCoordinatorRestoreBehavior.RESTORE_OR_RESET) { // we let the JobManager-side components know that there was a recovery, // even if there was no checkpoint to recover from, yet LOG.info("Resetting the Operator Coordinators to an empty state."); restoreStateToCoordinators( OperatorCoordinator.NO_CHECKPOINT, Collections.emptyMap()); } return OptionalLong.empty(); } LOG.info("Restoring job {} from {}.", job, latest); // re-assign the task states final Map operatorStates = latest.getOperatorStates(); StateAssignmentOperation stateAssignmentOperation = new StateAssignmentOperation( latest.getCheckpointID(), tasks, operatorStates, allowNonRestoredState); stateAssignmentOperation.assignStates(); // call master hooks for restore. we currently call them also on "regional restore" // because // there is no other failure notification mechanism in the master hooks // ultimately these should get removed anyways in favor of the operator coordinators MasterHooks.restoreMasterHooks( masterHooks, latest.getMasterHookStates(), latest.getCheckpointID(), allowNonRestoredState, LOG); if (operatorCoordinatorRestoreBehavior != OperatorCoordinatorRestoreBehavior.SKIP) { restoreStateToCoordinators(latest.getCheckpointID(), operatorStates); } // update metrics if (statsTracker != null) { long restoreTimestamp = System.currentTimeMillis(); RestoredCheckpointStats restored = new RestoredCheckpointStats( latest.getCheckpointID(), latest.getProperties(), restoreTimestamp, latest.getExternalPointer()); statsTracker.reportRestoredCheckpoint(restored); } return OptionalLong.of(latest.getCheckpointID()); } } /** * Restore the state with given savepoint. * * @param savepointPointer The pointer to the savepoint. 
* @param allowNonRestored True if allowing checkpoint state that cannot be mapped to any job * vertex in tasks. * @param tasks Map of job vertices to restore. State for these vertices is restored via {@link * Execution#setInitialState(JobManagerTaskRestore)}. * @param userClassLoader The class loader to resolve serialized classes in legacy savepoint * versions. */ public boolean restoreSavepoint( String savepointPointer, boolean allowNonRestored, Map tasks, ClassLoader userClassLoader) throws Exception { Preconditions.checkNotNull(savepointPointer, "The savepoint path cannot be null."); LOG.info( "Starting job {} from savepoint {} ({})", job, savepointPointer, (allowNonRestored ? "allowing non restored state" : "")); final CompletedCheckpointStorageLocation checkpointLocation = checkpointStorageView.resolveCheckpoint(savepointPointer); // Load the savepoint as a checkpoint into the system CompletedCheckpoint savepoint = Checkpoints.loadAndValidateCheckpoint( job, tasks, checkpointLocation, userClassLoader, allowNonRestored); completedCheckpointStore.addCheckpoint( savepoint, checkpointsCleaner, this::scheduleTriggerRequest); // Reset the checkpoint ID counter long nextCheckpointId = savepoint.getCheckpointID() + 1; checkpointIdCounter.setCount(nextCheckpointId); LOG.info("Reset the checkpoint ID of job {} to {}.", job, nextCheckpointId); final OptionalLong restoredCheckpointId = restoreLatestCheckpointedStateInternal( new HashSet<>(tasks.values()), OperatorCoordinatorRestoreBehavior.RESTORE_IF_CHECKPOINT_PRESENT, true, allowNonRestored); return restoredCheckpointId.isPresent(); } // ------------------------------------------------------------------------ // Accessors // ------------------------------------------------------------------------ public int getNumberOfPendingCheckpoints() { synchronized (lock) { return this.pendingCheckpoints.size(); } } public int getNumberOfRetainedSuccessfulCheckpoints() { synchronized (lock) { return completedCheckpointStore.getNumberOfRetainedCheckpoints(); } } public Map getPendingCheckpoints() { synchronized (lock) { return new HashMap<>(this.pendingCheckpoints); } } public List getSuccessfulCheckpoints() throws Exception { synchronized (lock) { return completedCheckpointStore.getAllCheckpoints(); } } public CheckpointStorageCoordinatorView getCheckpointStorage() { return checkpointStorageView; } public CompletedCheckpointStore getCheckpointStore() { return completedCheckpointStore; } public long getCheckpointTimeout() { return checkpointTimeout; } /** @deprecated use {@link #getNumQueuedRequests()} */ @Deprecated @VisibleForTesting PriorityQueue getTriggerRequestQueue() { synchronized (lock) { return requestDecider.getTriggerRequestQueue(); } } public boolean isTriggering() { return isTriggering; } @VisibleForTesting boolean isCurrentPeriodicTriggerAvailable() { return currentPeriodicTrigger != null; } /** * Returns whether periodic checkpointing has been configured. * * @return true if periodic checkpoints have been configured. 
*/ public boolean isPeriodicCheckpointingConfigured() { return baseInterval != Long.MAX_VALUE; } // -------------------------------------------------------------------------------------------- // Periodic scheduling of checkpoints // -------------------------------------------------------------------------------------------- public void startCheckpointScheduler() { synchronized (lock) { if (shutdown) { throw new IllegalArgumentException("Checkpoint coordinator is shut down"); } Preconditions.checkState( isPeriodicCheckpointingConfigured(), "Can not start checkpoint scheduler, if no periodic checkpointing is configured"); // make sure all prior timers are cancelled stopCheckpointScheduler(); periodicScheduling = true; currentPeriodicTrigger = scheduleTriggerWithDelay(getRandomInitDelay()); } } public void stopCheckpointScheduler() { synchronized (lock) { periodicScheduling = false; cancelPeriodicTrigger(); final CheckpointException reason = new CheckpointException(CheckpointFailureReason.CHECKPOINT_COORDINATOR_SUSPEND); abortPendingAndQueuedCheckpoints(reason); numUnsuccessfulCheckpointsTriggers.set(0); } } public boolean isPeriodicCheckpointingStarted() { return periodicScheduling; } /** * Aborts all the pending checkpoints due to en exception. * * @param exception The exception. */ public void abortPendingCheckpoints(CheckpointException exception) { synchronized (lock) { abortPendingCheckpoints(ignored -> true, exception); } } private void abortPendingCheckpoints( Predicate checkpointToFailPredicate, CheckpointException exception) { assert Thread.holdsLock(lock); final PendingCheckpoint[] pendingCheckpointsToFail = pendingCheckpoints.values().stream() .filter(checkpointToFailPredicate) .toArray(PendingCheckpoint[]::new); // do not traverse pendingCheckpoints directly, because it might be changed during // traversing for (PendingCheckpoint pendingCheckpoint : pendingCheckpointsToFail) { abortPendingCheckpoint(pendingCheckpoint, exception); } } private void rescheduleTrigger(long tillNextMillis) { cancelPeriodicTrigger(); currentPeriodicTrigger = scheduleTriggerWithDelay(tillNextMillis); } private void cancelPeriodicTrigger() { if (currentPeriodicTrigger != null) { currentPeriodicTrigger.cancel(false); currentPeriodicTrigger = null; } } private long getRandomInitDelay() { return ThreadLocalRandom.current().nextLong(minPauseBetweenCheckpoints, baseInterval + 1L); } private ScheduledFuture scheduleTriggerWithDelay(long initDelay) { return timer.scheduleAtFixedRate( new ScheduledTrigger(), initDelay, baseInterval, TimeUnit.MILLISECONDS); } private void restoreStateToCoordinators( final long checkpointId, final Map operatorStates) throws Exception { for (OperatorCoordinatorCheckpointContext coordContext : coordinatorsToCheckpoint) { final OperatorState state = operatorStates.get(coordContext.operatorId()); final ByteStreamStateHandle coordinatorState = state == null ? null : state.getCoordinatorState(); final byte[] bytes = coordinatorState == null ? 
null : coordinatorState.getData(); coordContext.resetToCheckpoint(checkpointId, bytes); } } // ------------------------------------------------------------------------ // job status listener that schedules / cancels periodic checkpoints // ------------------------------------------------------------------------ public JobStatusListener createActivatorDeactivator() { synchronized (lock) { if (shutdown) { throw new IllegalArgumentException("Checkpoint coordinator is shut down"); } if (jobStatusListener == null) { jobStatusListener = new CheckpointCoordinatorDeActivator(this); } return jobStatusListener; } } int getNumQueuedRequests() { synchronized (lock) { return requestDecider.getNumQueuedRequests(); } } public void reportStats(long id, ExecutionAttemptID attemptId, CheckpointMetrics metrics) throws CheckpointException { if (statsTracker != null) { attemptMappingProvider .getVertex(attemptId) .ifPresent(ev -> statsTracker.reportIncompleteStats(id, ev, metrics)); } } // ------------------------------------------------------------------------ private final class ScheduledTrigger implements Runnable { @Override public void run() { try { triggerCheckpoint(true); } catch (Exception e) { LOG.error("Exception while triggering checkpoint for job {}.", job, e); } } } /** * Discards the given state object asynchronously belonging to the given job, execution attempt * id and checkpoint id. * * @param jobId identifying the job to which the state object belongs * @param executionAttemptID identifying the task to which the state object belongs * @param checkpointId of the state object * @param subtaskState to discard asynchronously */ private void discardSubtaskState( final JobID jobId, final ExecutionAttemptID executionAttemptID, final long checkpointId, final TaskStateSnapshot subtaskState) { if (subtaskState != null) { executor.execute( new Runnable() { @Override public void run() { try { subtaskState.discardState(); } catch (Throwable t2) { LOG.warn( "Could not properly discard state object of checkpoint {} " + "belonging to task {} of job {}.", checkpointId, executionAttemptID, jobId, t2); } } }); } } private void abortPendingCheckpoint( PendingCheckpoint pendingCheckpoint, CheckpointException exception) { abortPendingCheckpoint(pendingCheckpoint, exception, null); } private void abortPendingCheckpoint( PendingCheckpoint pendingCheckpoint, CheckpointException exception, @Nullable final ExecutionAttemptID executionAttemptID) { assert (Thread.holdsLock(lock)); if (!pendingCheckpoint.isDisposed()) { try { // release resource here pendingCheckpoint.abort( exception.getCheckpointFailureReason(), exception.getCause(), checkpointsCleaner, this::scheduleTriggerRequest, executor, getStatsCallback(pendingCheckpoint)); if (pendingCheckpoint.getProps().isSavepoint() && pendingCheckpoint.getProps().isSynchronous()) { failureManager.handleSynchronousSavepointFailure(exception); } else if (executionAttemptID != null) { failureManager.handleTaskLevelCheckpointException( exception, pendingCheckpoint.getCheckpointId(), executionAttemptID); } else { failureManager.handleJobLevelCheckpointException( exception, pendingCheckpoint.getCheckpointId()); } } finally { sendAbortedMessages( pendingCheckpoint.getCheckpointPlan().getTasksToCommitTo(), pendingCheckpoint.getCheckpointId(), pendingCheckpoint.getCheckpointTimestamp()); pendingCheckpoints.remove(pendingCheckpoint.getCheckpointId()); rememberRecentCheckpointId(pendingCheckpoint.getCheckpointId()); scheduleTriggerRequest(); } } } private void 
preCheckGlobalState(boolean isPeriodic) throws CheckpointException { // abort if the coordinator has been shutdown in the meantime if (shutdown) { throw new CheckpointException(CheckpointFailureReason.CHECKPOINT_COORDINATOR_SHUTDOWN); } // Don't allow periodic checkpoint if scheduling has been disabled if (isPeriodic && !periodicScheduling) { throw new CheckpointException(CheckpointFailureReason.PERIODIC_SCHEDULER_SHUTDOWN); } } private void abortPendingAndQueuedCheckpoints(CheckpointException exception) { assert (Thread.holdsLock(lock)); requestDecider.abortAll(exception); abortPendingCheckpoints(exception); } /** * The canceller of checkpoint. The checkpoint might be cancelled if it doesn't finish in a * configured period. */ private class CheckpointCanceller implements Runnable { private final PendingCheckpoint pendingCheckpoint; private CheckpointCanceller(PendingCheckpoint pendingCheckpoint) { this.pendingCheckpoint = checkNotNull(pendingCheckpoint); } @Override public void run() { synchronized (lock) { // only do the work if the checkpoint is not discarded anyways // note that checkpoint completion discards the pending checkpoint object if (!pendingCheckpoint.isDisposed()) { LOG.info( "Checkpoint {} of job {} expired before completing.", pendingCheckpoint.getCheckpointId(), job); abortPendingCheckpoint( pendingCheckpoint, new CheckpointException(CheckpointFailureReason.CHECKPOINT_EXPIRED)); } } } } private static CheckpointException getCheckpointException( CheckpointFailureReason defaultReason, Throwable throwable) { final Optional checkpointExceptionOptional = findThrowable(throwable, CheckpointException.class); return checkpointExceptionOptional.orElseGet( () -> new CheckpointException(defaultReason, throwable)); } private static class CheckpointIdAndStorageLocation { private final long checkpointId; private final CheckpointStorageLocation checkpointStorageLocation; CheckpointIdAndStorageLocation( long checkpointId, CheckpointStorageLocation checkpointStorageLocation) { this.checkpointId = checkpointId; this.checkpointStorageLocation = checkNotNull(checkpointStorageLocation); } } static class CheckpointTriggerRequest { final long timestamp; final CheckpointProperties props; final @Nullable String externalSavepointLocation; final boolean isPeriodic; private final CompletableFuture onCompletionPromise = new CompletableFuture<>(); CheckpointTriggerRequest( CheckpointProperties props, @Nullable String externalSavepointLocation, boolean isPeriodic) { this.timestamp = System.currentTimeMillis(); this.props = checkNotNull(props); this.externalSavepointLocation = externalSavepointLocation; this.isPeriodic = isPeriodic; } CompletableFuture getOnCompletionFuture() { return onCompletionPromise; } public void completeExceptionally(CheckpointException exception) { onCompletionPromise.completeExceptionally(exception); } public boolean isForce() { return props.forceCheckpoint(); } } private enum OperatorCoordinatorRestoreBehavior { /** Coordinators are always restored. If there is no checkpoint, they are restored empty. */ RESTORE_OR_RESET, /** Coordinators are restored if there was a checkpoint. */ RESTORE_IF_CHECKPOINT_PRESENT, /** Coordinators are not restored during this checkpoint restore. 
*/ SKIP; } private void trackPendingCheckpointStats(PendingCheckpoint checkpoint) { if (statsTracker == null) { return; } Map vertices = Stream.concat( checkpoint.getCheckpointPlan().getTasksToWaitFor().stream(), checkpoint.getCheckpointPlan().getFinishedTasks().stream()) .map(Execution::getVertex) .map(ExecutionVertex::getJobVertex) .distinct() .collect( toMap( ExecutionJobVertex::getJobVertexId, ExecutionJobVertex::getParallelism)); PendingCheckpointStats pendingCheckpointStats = statsTracker.reportPendingCheckpoint( checkpoint.getCheckpointID(), checkpoint.getCheckpointTimestamp(), checkpoint.getProps(), vertices); reportFinishedTasks( pendingCheckpointStats, checkpoint.getCheckpointPlan().getFinishedTasks()); } private void reportFinishedTasks( PendingCheckpointStats pendingCheckpointStats, List finishedTasks) { long now = System.currentTimeMillis(); finishedTasks.forEach( execution -> pendingCheckpointStats.reportSubtaskStats( execution.getVertex().getJobvertexId(), new SubtaskStateStats(execution.getParallelSubtaskIndex(), now))); } @Nullable private PendingCheckpointStats getStatsCallback(PendingCheckpoint pendingCheckpoint) { return statsTracker == null ? null : statsTracker.getPendingCheckpointStats(pendingCheckpoint.getCheckpointID()); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.13/org/apache/flink/runtime/util/EnvironmentInformation.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.runtime.util; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.configuration.GlobalConfiguration; import org.apache.flink.util.OperatingSystem; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.InputStream; import java.lang.management.ManagementFactory; import java.lang.management.RuntimeMXBean; import java.lang.reflect.Method; import java.time.Instant; import java.time.ZoneId; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeParseException; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.concurrent.ConcurrentHashMap; /** * Utility class that gives access to the execution environment of the JVM, like the executing user, * startup options, or the JVM version. */ public class EnvironmentInformation { @VisibleForTesting public static final String UNKNOWN_COMMIT_ID = "DecafC0ffeeD0d0F00d"; @VisibleForTesting public static final String UNKNOWN_COMMIT_ID_ABBREV = "DeadD0d0"; private static final Logger LOG = LoggerFactory.getLogger(EnvironmentInformation.class); public static final String UNKNOWN = ""; /** * Returns the version of the code as String. * * @return The project version string. 
*/ public static String getVersion() { return getVersionsInstance().projectVersion; } /** * Returns the version of the used Scala compiler as String. * * @return The scala version string. */ public static String getScalaVersion() { return getVersionsInstance().scalaVersion; } /** @return The Instant this version of the software was built. */ public static Instant getBuildTime() { return getVersionsInstance().gitBuildTime; } /** * @return The Instant this version of the software was built as a String using the * Europe/Berlin timezone. */ public static String getBuildTimeString() { return getVersionsInstance().gitBuildTimeStr; } /** @return The last known commit id of this version of the software. */ public static String getGitCommitId() { return getVersionsInstance().gitCommitId; } /** @return The last known abbreviated commit id of this version of the software. */ public static String getGitCommitIdAbbrev() { return getVersionsInstance().gitCommitIdAbbrev; } /** @return The Instant of the last commit of this code. */ public static Instant getGitCommitTime() { return getVersionsInstance().gitCommitTime; } /** * @return The Instant of the last commit of this code as a String using the Europe/Berlin * timezone. */ public static String getGitCommitTimeString() { return getVersionsInstance().gitCommitTimeStr; } /** * Returns the code revision (commit and commit date) of Flink, as generated by the Maven * builds. * * @return The code revision. */ public static RevisionInformation getRevisionInformation() { return new RevisionInformation(getGitCommitIdAbbrev(), getGitCommitTimeString()); } private static final class Versions { private static final Instant DEFAULT_TIME_INSTANT = Instant.EPOCH; private static final String DEFAULT_TIME_STRING = "1970-01-01T00:00:00+0000"; private String projectVersion = UNKNOWN; private String scalaVersion = UNKNOWN; private Instant gitBuildTime = DEFAULT_TIME_INSTANT; private String gitBuildTimeStr = DEFAULT_TIME_STRING; private String gitCommitId = UNKNOWN_COMMIT_ID; private String gitCommitIdAbbrev = UNKNOWN_COMMIT_ID_ABBREV; private Instant gitCommitTime = DEFAULT_TIME_INSTANT; private String gitCommitTimeStr = DEFAULT_TIME_STRING; private static final String PROP_FILE = ".flink-runtime.version.properties"; private static final String FAIL_MESSAGE = "The file " + PROP_FILE + " has not been generated correctly. You MUST run 'mvn generate-sources' in the flink-runtime module."; private String getProperty(Properties properties, String key, String defaultValue) { String value = properties.getProperty(key); if (value == null || value.charAt(0) == '$') { return defaultValue; } return value; } public Versions() { ClassLoader classLoader = EnvironmentInformation.class.getClassLoader(); try (InputStream propFile = classLoader.getResourceAsStream(PROP_FILE)) { if (propFile != null) { Properties properties = new Properties(); properties.load(propFile); projectVersion = getProperty(properties, "project.version", UNKNOWN); scalaVersion = getProperty(properties, "scala.binary.version", UNKNOWN); gitCommitId = getProperty(properties, "git.commit.id", UNKNOWN_COMMIT_ID); gitCommitIdAbbrev = getProperty( properties, "git.commit.id.abbrev", UNKNOWN_COMMIT_ID_ABBREV); // This is to reliably parse the datetime format configured in the // git-commit-id-plugin DateTimeFormatter gitDateTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ssZ"); // Default format is in Berlin timezone because that is where Flink originated. 
DateTimeFormatter berlinDateTime = DateTimeFormatter.ISO_OFFSET_DATE_TIME.withZone( ZoneId.of("Europe/Berlin")); try { String propGitCommitTime = getProperty(properties, "git.commit.time", DEFAULT_TIME_STRING); gitCommitTime = gitDateTimeFormatter.parse(propGitCommitTime, Instant::from); gitCommitTimeStr = berlinDateTime.format(gitCommitTime); String propGitBuildTime = getProperty(properties, "git.build.time", DEFAULT_TIME_STRING); gitBuildTime = gitDateTimeFormatter.parse(propGitBuildTime, Instant::from); gitBuildTimeStr = berlinDateTime.format(gitBuildTime); } catch (DateTimeParseException dtpe) { LOG.error("{} : {}", FAIL_MESSAGE, dtpe); throw new IllegalStateException(FAIL_MESSAGE); } } } catch (IOException ioe) { LOG.info( "Cannot determine code revision: Unable to read version property file.: {}", ioe.getMessage()); } } } private static final class VersionsHolder { static final Versions INSTANCE = new Versions(); } private static Versions getVersionsInstance() { return VersionsHolder.INSTANCE; } /** * Gets the name of the user that is running the JVM. * * @return The name of the user that is running the JVM. */ public static String getHadoopUser() { try { Class ugiClass = Class.forName( "org.apache.hadoop.security.UserGroupInformation", false, EnvironmentInformation.class.getClassLoader()); Method currentUserMethod = ugiClass.getMethod("getCurrentUser"); Method shortUserNameMethod = ugiClass.getMethod("getShortUserName"); Object ugi = currentUserMethod.invoke(null); return (String) shortUserNameMethod.invoke(ugi); } catch (ClassNotFoundException e) { return ""; } catch (LinkageError e) { // hadoop classes are not in the classpath LOG.debug( "Cannot determine user/group information using Hadoop utils. " + "Hadoop classes not loaded or compatible", e); } catch (Throwable t) { // some other error occurred that we should log and make known LOG.warn("Error while accessing user/group information via Hadoop utils.", t); } return UNKNOWN; } /** * The maximum JVM heap size, in bytes. * *
This method uses the -Xmx value of the JVM, if set. If not set, it returns (as a * heuristic) 1/4th of the physical memory size. * * @return The maximum JVM heap size, in bytes. */ public static long getMaxJvmHeapMemory() { final long maxMemory = Runtime.getRuntime().maxMemory(); if (maxMemory != Long.MAX_VALUE) { // we have the proper max memory return maxMemory; } else { // max JVM heap size is not set - use the heuristic to use 1/4th of the physical memory final long physicalMemory = Hardware.getSizeOfPhysicalMemory(); if (physicalMemory != -1) { // got proper value for physical memory return physicalMemory / 4; } else { throw new RuntimeException( "Could not determine the amount of free memory.\n" + "Please set the maximum memory for the JVM, e.g. -Xmx512M for 512 megabytes."); } } } /** * Gets an estimate of the size of the free heap memory. * *
NOTE: This method is heavy-weight. It triggers a garbage collection to reduce * fragmentation and get a better estimate at the size of free memory. It is typically more * accurate than the plain version {@link #getSizeOfFreeHeapMemory()}. * * @return An estimate of the size of the free heap memory, in bytes. */ public static long getSizeOfFreeHeapMemoryWithDefrag() { // trigger a garbage collection, to reduce fragmentation System.gc(); return getSizeOfFreeHeapMemory(); } /** * Gets an estimate of the size of the free heap memory. The estimate may vary, depending on the * current level of memory fragmentation and the number of dead objects. For a better (but more * heavy-weight) estimate, use {@link #getSizeOfFreeHeapMemoryWithDefrag()}. * * @return An estimate of the size of the free heap memory, in bytes. */ public static long getSizeOfFreeHeapMemory() { Runtime r = Runtime.getRuntime(); return getMaxJvmHeapMemory() - r.totalMemory() + r.freeMemory(); } /** * Gets the version of the JVM in the form "VM_Name - Vendor - Spec/Version". * * @return The JVM version. */ public static String getJvmVersion() { try { final RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); return bean.getVmName() + " - " + bean.getVmVendor() + " - " + bean.getSpecVersion() + '/' + bean.getVmVersion(); } catch (Throwable t) { return UNKNOWN; } } /** * Gets the system parameters and environment parameters that were passed to the JVM on startup. * * @return The options passed to the JVM on startup. */ public static String getJvmStartupOptions() { try { final RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); final StringBuilder bld = new StringBuilder(); for (String s : bean.getInputArguments()) { bld.append(s).append(' '); } return bld.toString(); } catch (Throwable t) { return UNKNOWN; } } /** * Gets the system parameters and environment parameters that were passed to the JVM on startup. * * @return The options passed to the JVM on startup. */ public static String[] getJvmStartupOptionsArray() { try { RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); List options = bean.getInputArguments(); return options.toArray(new String[options.size()]); } catch (Throwable t) { return new String[0]; } } /** * Gets the directory for temporary files, as returned by the JVM system property * "java.io.tmpdir". * * @return The directory for temporary files. */ public static String getTemporaryFileDirectory() { return System.getProperty("java.io.tmpdir"); } /** * Tries to retrieve the maximum number of open file handles. This method will only work on * UNIX-based operating systems with Sun/Oracle Java versions. * *
If the number of max open file handles cannot be determined, this method returns {@code -1}. * * @return The limit of open file handles, or {@code -1}, if the limit could not be determined. */ public static long getOpenFileHandlesLimit() { if (OperatingSystem.isWindows()) { // getMaxFileDescriptorCount method is not available on Windows return -1L; } Class<?> sunBeanClass; try { sunBeanClass = Class.forName("com.sun.management.UnixOperatingSystemMXBean"); } catch (ClassNotFoundException e) { return -1L; } try { Method fhLimitMethod = sunBeanClass.getMethod("getMaxFileDescriptorCount"); Object result = fhLimitMethod.invoke(ManagementFactory.getOperatingSystemMXBean()); return (Long) result; } catch (Throwable t) { LOG.warn("Unexpected error when accessing file handle limit", t); return -1L; } } // TODO: ------------ start: custom (secondary development) code ---------------- // // Flag indicating whether the current process is the JobManager private static Boolean IS_JOBMANAGER = true; private static final Map<String, String> settings = new ConcurrentHashMap<>(); /** * Determines whether the current component is the JobManager. */ public static boolean isJobManager() { return IS_JOBMANAGER; } /** * Returns the collected configuration settings. */ public static Map<String, String> getSettings() { return settings; } /** * Stores a configuration entry if the key is not already present. */ public static void setSetting(String key, String value) { if (!settings.containsKey(key)) settings.put(key, value); } /** * Parses the command line arguments and determines whether the current component is the JobManager. */ private static void parseCommand(String[] commandLineArgs) { if (commandLineArgs != null) { for (String command : commandLineArgs) { if (command != null && command.length() > 0) { if (command.contains("resource-id")) { IS_JOBMANAGER = false; } if (!"-D".equals(command)) { String[] properties = command.replace("-D", "").split("=", 2); if (properties != null && properties.length == 2 && properties[0] != null && properties[1] != null) { settings.put(properties[0], properties[1]); } } } } } } // TODO: ------------ end: custom (secondary development) code ----------------- // /** * Logs information about the environment, like code revision, current user, Java version, and * JVM parameters. * * @param log The logger to log the information to. * @param componentName The component name to mention in the log. * @param commandLineArgs The arguments accompanying the starting of the component. */ public static void logEnvironmentInfo( Logger log, String componentName, String[] commandLineArgs) { // TODO: ------------ start: custom (secondary development) code --------------- // parseCommand(commandLineArgs); // TODO: ------------ end: custom (secondary development) code ---------------- // if (log.isInfoEnabled()) { RevisionInformation rev = getRevisionInformation(); String version = getVersion(); String scalaVersion = getScalaVersion(); String jvmVersion = getJvmVersion(); String[] options = getJvmStartupOptionsArray(); String javaHome = System.getenv("JAVA_HOME"); String inheritedLogs = System.getenv("FLINK_INHERITED_LOGS"); long maxHeapMegabytes = getMaxJvmHeapMemory() >>> 20; if (inheritedLogs != null) { log.info( "--------------------------------------------------------------------------------"); log.info(" Preconfiguration: "); log.info(inheritedLogs); } log.info( "--------------------------------------------------------------------------------"); log.info( " Starting " + componentName + " (Version: " + version + ", Scala: " + scalaVersion + ", " + "Rev:" + rev.commitId + ", " + "Date:" + rev.commitDate + ")"); log.info(" OS current user: " + System.getProperty("user.name")); log.info(" Current Hadoop/Kerberos user: " + getHadoopUser()); log.info(" JVM: " + jvmVersion); log.info(" Maximum heap size: " + maxHeapMegabytes + " MiBytes"); log.info(" JAVA_HOME: " + (javaHome == null ?
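/* Illustration of the custom parseCommand(...) logic above (the argument values are hypothetical): given launch arguments such as "-Dtaskmanager.memory.process.size=4g" and "--resource-id container_000001", the argument containing "resource-id" flips IS_JOBMANAGER to false, i.e. the process is treated as a TaskManager, while the -D pair is collected via settings.put("taskmanager.memory.process.size", "4g"); a bare "-D" argument is skipped when collecting properties. */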
"(not set)" : javaHome)); String hadoopVersionString = getHadoopVersionString(); if (hadoopVersionString != null) { log.info(" Hadoop version: " + hadoopVersionString); } else { log.info(" No Hadoop Dependency available"); } if (options.length == 0) { log.info(" JVM Options: (none)"); } else { log.info(" JVM Options:"); for (String s : options) { log.info(" " + s); } } if (commandLineArgs == null || commandLineArgs.length == 0) { log.info(" Program Arguments: (none)"); } else { log.info(" Program Arguments:"); for (String s : commandLineArgs) { if (GlobalConfiguration.isSensitive(s)) { log.info( " " + GlobalConfiguration.HIDDEN_CONTENT + " (sensitive information)"); } else { log.info(" " + s); } } } log.info(" Classpath: " + System.getProperty("java.class.path")); log.info( "--------------------------------------------------------------------------------"); } } public static String getHadoopVersionString() { try { Class versionInfoClass = Class.forName( "org.apache.hadoop.util.VersionInfo", false, EnvironmentInformation.class.getClassLoader()); Method method = versionInfoClass.getMethod("getVersion"); return (String) method.invoke(null); } catch (ClassNotFoundException | NoSuchMethodException e) { return null; } catch (Throwable e) { LOG.error("Cannot invoke VersionInfo.getVersion reflectively.", e); return null; } } // -------------------------------------------------------------------------------------------- /** Don't instantiate this class */ private EnvironmentInformation() {} // -------------------------------------------------------------------------------------------- /** * Revision information encapsulates information about the source code revision of the Flink * code. */ public static class RevisionInformation { /** The git commit id (hash) */ public final String commitId; /** The git commit date */ public final String commitDate; public RevisionInformation(String commitId, String commitDate) { this.commitId = commitId; this.commitDate = commitDate; } } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.13/org/apache/flink/table/api/internal/TableEnvironmentImpl.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.table.api.internal; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.JobExecutionResult; import org.apache.flink.api.dag.Pipeline; import org.apache.flink.api.dag.Transformation; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.PipelineOptions; import org.apache.flink.core.execution.JobClient; import org.apache.flink.table.api.DataTypes; import org.apache.flink.table.api.EnvironmentSettings; import org.apache.flink.table.api.ExplainDetail; import org.apache.flink.table.api.ResultKind; import org.apache.flink.table.api.SqlParserException; import org.apache.flink.table.api.StatementSet; import org.apache.flink.table.api.Table; import org.apache.flink.table.api.TableConfig; import org.apache.flink.table.api.TableEnvironment; import org.apache.flink.table.api.TableException; import org.apache.flink.table.api.TableResult; import org.apache.flink.table.api.TableSchema; import org.apache.flink.table.api.ValidationException; import org.apache.flink.table.catalog.Catalog; import org.apache.flink.table.catalog.CatalogBaseTable; import org.apache.flink.table.catalog.CatalogFunction; import org.apache.flink.table.catalog.CatalogManager; import org.apache.flink.table.catalog.CatalogPartition; import org.apache.flink.table.catalog.CatalogPartitionSpec; import org.apache.flink.table.catalog.CatalogTable; import org.apache.flink.table.catalog.CatalogTableImpl; import org.apache.flink.table.catalog.Column; import org.apache.flink.table.catalog.ConnectorCatalogTable; import org.apache.flink.table.catalog.FunctionCatalog; import org.apache.flink.table.catalog.GenericInMemoryCatalog; import org.apache.flink.table.catalog.ObjectIdentifier; import org.apache.flink.table.catalog.ObjectPath; import org.apache.flink.table.catalog.QueryOperationCatalogView; import org.apache.flink.table.catalog.ResolvedSchema; import org.apache.flink.table.catalog.UnresolvedIdentifier; import org.apache.flink.table.catalog.WatermarkSpec; import org.apache.flink.table.catalog.exceptions.CatalogException; import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; import org.apache.flink.table.catalog.exceptions.FunctionAlreadyExistException; import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; import org.apache.flink.table.catalog.exceptions.TableNotExistException; import org.apache.flink.table.delegation.Executor; import org.apache.flink.table.delegation.ExecutorFactory; import org.apache.flink.table.delegation.Parser; import org.apache.flink.table.delegation.Planner; import org.apache.flink.table.delegation.PlannerFactory; import org.apache.flink.table.descriptors.ConnectTableDescriptor; import org.apache.flink.table.descriptors.ConnectorDescriptor; import org.apache.flink.table.descriptors.StreamTableDescriptor; import org.apache.flink.table.expressions.ApiExpressionUtils; import org.apache.flink.table.expressions.Expression; import org.apache.flink.table.factories.ComponentFactoryService; import org.apache.flink.table.factories.FactoryUtil; import org.apache.flink.table.factories.ModuleFactory; import org.apache.flink.table.factories.TableFactoryService; import org.apache.flink.table.functions.ScalarFunction; 
import org.apache.flink.table.functions.UserDefinedFunction; import org.apache.flink.table.functions.UserDefinedFunctionHelper; import org.apache.flink.table.module.Module; import org.apache.flink.table.module.ModuleEntry; import org.apache.flink.table.module.ModuleManager; import org.apache.flink.table.operations.CatalogQueryOperation; import org.apache.flink.table.operations.CatalogSinkModifyOperation; import org.apache.flink.table.operations.CollectModifyOperation; import org.apache.flink.table.operations.DescribeTableOperation; import org.apache.flink.table.operations.ExplainOperation; import org.apache.flink.table.operations.LoadModuleOperation; import org.apache.flink.table.operations.ModifyOperation; import org.apache.flink.table.operations.NopOperation; import org.apache.flink.table.operations.Operation; import org.apache.flink.table.operations.QueryOperation; import org.apache.flink.table.operations.ShowCatalogsOperation; import org.apache.flink.table.operations.ShowCurrentCatalogOperation; import org.apache.flink.table.operations.ShowCurrentDatabaseOperation; import org.apache.flink.table.operations.ShowDatabasesOperation; import org.apache.flink.table.operations.ShowFunctionsOperation; import org.apache.flink.table.operations.ShowModulesOperation; import org.apache.flink.table.operations.ShowPartitionsOperation; import org.apache.flink.table.operations.ShowTablesOperation; import org.apache.flink.table.operations.ShowViewsOperation; import org.apache.flink.table.operations.TableSourceQueryOperation; import org.apache.flink.table.operations.UnloadModuleOperation; import org.apache.flink.table.operations.UseCatalogOperation; import org.apache.flink.table.operations.UseDatabaseOperation; import org.apache.flink.table.operations.UseModulesOperation; import org.apache.flink.table.operations.ddl.AddPartitionsOperation; import org.apache.flink.table.operations.ddl.AlterCatalogFunctionOperation; import org.apache.flink.table.operations.ddl.AlterDatabaseOperation; import org.apache.flink.table.operations.ddl.AlterPartitionPropertiesOperation; import org.apache.flink.table.operations.ddl.AlterTableAddConstraintOperation; import org.apache.flink.table.operations.ddl.AlterTableDropConstraintOperation; import org.apache.flink.table.operations.ddl.AlterTableOperation; import org.apache.flink.table.operations.ddl.AlterTableOptionsOperation; import org.apache.flink.table.operations.ddl.AlterTableRenameOperation; import org.apache.flink.table.operations.ddl.AlterTableSchemaOperation; import org.apache.flink.table.operations.ddl.AlterViewAsOperation; import org.apache.flink.table.operations.ddl.AlterViewOperation; import org.apache.flink.table.operations.ddl.AlterViewPropertiesOperation; import org.apache.flink.table.operations.ddl.AlterViewRenameOperation; import org.apache.flink.table.operations.ddl.CreateCatalogFunctionOperation; import org.apache.flink.table.operations.ddl.CreateCatalogOperation; import org.apache.flink.table.operations.ddl.CreateDatabaseOperation; import org.apache.flink.table.operations.ddl.CreateTableASOperation; import org.apache.flink.table.operations.ddl.CreateTableOperation; import org.apache.flink.table.operations.ddl.CreateTempSystemFunctionOperation; import org.apache.flink.table.operations.ddl.CreateViewOperation; import org.apache.flink.table.operations.ddl.DropCatalogFunctionOperation; import org.apache.flink.table.operations.ddl.DropCatalogOperation; import org.apache.flink.table.operations.ddl.DropDatabaseOperation; import 
org.apache.flink.table.operations.ddl.DropPartitionsOperation; import org.apache.flink.table.operations.ddl.DropTableOperation; import org.apache.flink.table.operations.ddl.DropTempSystemFunctionOperation; import org.apache.flink.table.operations.ddl.DropViewOperation; import org.apache.flink.table.operations.utils.OperationTreeBuilder; import org.apache.flink.table.sinks.TableSink; import org.apache.flink.table.sources.TableSource; import org.apache.flink.table.sources.TableSourceValidation; import org.apache.flink.table.types.AbstractDataType; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.utils.PrintUtils; import org.apache.flink.table.utils.TableSchemaUtils; import org.apache.flink.types.Row; import org.apache.flink.util.Preconditions; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.concurrent.ExecutionException; import java.util.stream.Collectors; import java.util.stream.StreamSupport; import static org.apache.flink.table.api.config.TableConfigOptions.TABLE_DML_SYNC; import static org.apache.flink.table.descriptors.ModuleDescriptorValidator.MODULE_TYPE; /** * Implementation of {@link TableEnvironment} that works exclusively with Table API interfaces. Only * {@link TableSource} is supported as an input and {@link TableSink} as an output. It also does not * bind to any particular {@code StreamExecutionEnvironment}. */ @Internal public class TableEnvironmentImpl implements TableEnvironmentInternal { // Flag that tells if the TableSource/TableSink used in this environment is stream table // source/sink, // and this should always be true. This avoids too many hard code. private static final boolean IS_STREAM_TABLE = true; private final CatalogManager catalogManager; private final ModuleManager moduleManager; private final OperationTreeBuilder operationTreeBuilder; private final List bufferedModifyOperations = new ArrayList<>(); protected final TableConfig tableConfig; protected final Executor execEnv; protected final FunctionCatalog functionCatalog; protected final Planner planner; private final boolean isStreamingMode; private final ClassLoader userClassLoader; private static final String UNSUPPORTED_QUERY_IN_SQL_UPDATE_MSG = "Unsupported SQL query! sqlUpdate() only accepts a single SQL statement of type " + "INSERT, CREATE TABLE, DROP TABLE, ALTER TABLE, USE CATALOG, USE [CATALOG.]DATABASE, " + "CREATE DATABASE, DROP DATABASE, ALTER DATABASE, CREATE FUNCTION, DROP FUNCTION, ALTER FUNCTION, " + "CREATE CATALOG, DROP CATALOG, CREATE VIEW, DROP VIEW, LOAD MODULE, UNLOAD " + "MODULE, USE MODULES."; private static final String UNSUPPORTED_QUERY_IN_EXECUTE_SQL_MSG = "Unsupported SQL query! executeSql() only accepts a single SQL statement of type " + "CREATE TABLE, DROP TABLE, ALTER TABLE, CREATE DATABASE, DROP DATABASE, ALTER DATABASE, " + "CREATE FUNCTION, DROP FUNCTION, ALTER FUNCTION, CREATE CATALOG, DROP CATALOG, " + "USE CATALOG, USE [CATALOG.]DATABASE, SHOW CATALOGS, SHOW DATABASES, SHOW TABLES, SHOW [USER] FUNCTIONS, SHOW PARTITIONS" + "CREATE VIEW, DROP VIEW, SHOW VIEWS, INSERT, DESCRIBE, LOAD MODULE, UNLOAD " + "MODULE, USE MODULES, SHOW [FULL] MODULES."; /** Provides necessary methods for {@link ConnectTableDescriptor}. 
*/ private final Registration registration = new Registration() { @Override public void createTemporaryTable(String path, CatalogBaseTable table) { UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); ObjectIdentifier objectIdentifier = catalogManager.qualifyIdentifier(unresolvedIdentifier); catalogManager.createTemporaryTable(table, objectIdentifier, false); } }; protected TableEnvironmentImpl( CatalogManager catalogManager, ModuleManager moduleManager, TableConfig tableConfig, Executor executor, FunctionCatalog functionCatalog, Planner planner, boolean isStreamingMode, ClassLoader userClassLoader) { this.catalogManager = catalogManager; this.moduleManager = moduleManager; this.execEnv = executor; this.tableConfig = tableConfig; this.functionCatalog = functionCatalog; this.planner = planner; this.isStreamingMode = isStreamingMode; this.userClassLoader = userClassLoader; this.operationTreeBuilder = OperationTreeBuilder.create( tableConfig, functionCatalog.asLookup(getParser()::parseIdentifier), catalogManager.getDataTypeFactory(), path -> { try { UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); Optional catalogQueryOperation = scanInternal(unresolvedIdentifier); return catalogQueryOperation.map( t -> ApiExpressionUtils.tableRef(path, t)); } catch (SqlParserException ex) { // The TableLookup is used during resolution of expressions and it // actually might not be an // identifier of a table. It might be a reference to some other // object such as column, local // reference etc. This method should return empty optional in such // cases to fallback for other // identifiers resolution. return Optional.empty(); } }, (sqlExpression, inputSchema) -> { try { return getParser().parseSqlExpression(sqlExpression, inputSchema); } catch (Throwable t) { throw new ValidationException( String.format("Invalid SQL expression: %s", sqlExpression), t); } }, isStreamingMode); catalogManager.initSchemaResolver( isStreamingMode, operationTreeBuilder.getResolverBuilder()); } public static TableEnvironmentImpl create(Configuration configuration) { return create(EnvironmentSettings.fromConfiguration(configuration), configuration); } public static TableEnvironmentImpl create(EnvironmentSettings settings) { return create(settings, settings.toConfiguration()); } private static TableEnvironmentImpl create( EnvironmentSettings settings, Configuration configuration) { // temporary solution until FLINK-15635 is fixed ClassLoader classLoader = Thread.currentThread().getContextClassLoader(); // use configuration to init table config TableConfig tableConfig = new TableConfig(); tableConfig.addConfiguration(configuration); ModuleManager moduleManager = new ModuleManager(); CatalogManager catalogManager = CatalogManager.newBuilder() .classLoader(classLoader) .config(tableConfig.getConfiguration()) .defaultCatalog( settings.getBuiltInCatalogName(), new GenericInMemoryCatalog( settings.getBuiltInCatalogName(), settings.getBuiltInDatabaseName())) .build(); FunctionCatalog functionCatalog = new FunctionCatalog(tableConfig, catalogManager, moduleManager); Map executorProperties = settings.toExecutorProperties(); Executor executor = ComponentFactoryService.find(ExecutorFactory.class, executorProperties) .create(executorProperties); Map plannerProperties = settings.toPlannerProperties(); Planner planner = ComponentFactoryService.find(PlannerFactory.class, plannerProperties) .create( plannerProperties, executor, tableConfig, functionCatalog, catalogManager); return new 
TableEnvironmentImpl( catalogManager, moduleManager, tableConfig, executor, functionCatalog, planner, settings.isStreamingMode(), classLoader); } @Override public Table fromValues(Object... values) { return fromValues(Arrays.asList(values)); } @Override public Table fromValues(AbstractDataType rowType, Object... values) { return fromValues(rowType, Arrays.asList(values)); } @Override public Table fromValues(Expression... values) { return createTable(operationTreeBuilder.values(values)); } @Override public Table fromValues(AbstractDataType rowType, Expression... values) { final DataType resolvedDataType = catalogManager.getDataTypeFactory().createDataType(rowType); return createTable(operationTreeBuilder.values(resolvedDataType, values)); } @Override public Table fromValues(Iterable values) { Expression[] exprs = StreamSupport.stream(values.spliterator(), false) .map(ApiExpressionUtils::objectToExpression) .toArray(Expression[]::new); return fromValues(exprs); } @Override public Table fromValues(AbstractDataType rowType, Iterable values) { Expression[] exprs = StreamSupport.stream(values.spliterator(), false) .map(ApiExpressionUtils::objectToExpression) .toArray(Expression[]::new); return fromValues(rowType, exprs); } @VisibleForTesting public Planner getPlanner() { return planner; } @Override public Table fromTableSource(TableSource source) { // only accept StreamTableSource and LookupableTableSource here // TODO should add a validation, while StreamTableSource is in flink-table-api-java-bridge // module now return createTable(new TableSourceQueryOperation<>(source, !IS_STREAM_TABLE)); } @Override public void registerCatalog(String catalogName, Catalog catalog) { catalogManager.registerCatalog(catalogName, catalog); } @Override public Optional getCatalog(String catalogName) { return catalogManager.getCatalog(catalogName); } @Override public void loadModule(String moduleName, Module module) { moduleManager.loadModule(moduleName, module); } @Override public void useModules(String... 
moduleNames) { moduleManager.useModules(moduleNames); } @Override public void unloadModule(String moduleName) { moduleManager.unloadModule(moduleName); } @Override public void registerFunction(String name, ScalarFunction function) { functionCatalog.registerTempSystemScalarFunction(name, function); } @Override public void createTemporarySystemFunction( String name, Class functionClass) { final UserDefinedFunction functionInstance = UserDefinedFunctionHelper.instantiateFunction(functionClass); createTemporarySystemFunction(name, functionInstance); } @Override public void createTemporarySystemFunction(String name, UserDefinedFunction functionInstance) { functionCatalog.registerTemporarySystemFunction(name, functionInstance, false); } @Override public boolean dropTemporarySystemFunction(String name) { return functionCatalog.dropTemporarySystemFunction(name, true); } @Override public void createFunction(String path, Class functionClass) { createFunction(path, functionClass, false); } @Override public void createFunction( String path, Class functionClass, boolean ignoreIfExists) { final UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); functionCatalog.registerCatalogFunction( unresolvedIdentifier, functionClass, ignoreIfExists); } @Override public boolean dropFunction(String path) { final UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); return functionCatalog.dropCatalogFunction(unresolvedIdentifier, true); } @Override public void createTemporaryFunction( String path, Class functionClass) { final UserDefinedFunction functionInstance = UserDefinedFunctionHelper.instantiateFunction(functionClass); createTemporaryFunction(path, functionInstance); } @Override public void createTemporaryFunction(String path, UserDefinedFunction functionInstance) { final UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); functionCatalog.registerTemporaryCatalogFunction( unresolvedIdentifier, functionInstance, false); } @Override public boolean dropTemporaryFunction(String path) { final UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); return functionCatalog.dropTemporaryCatalogFunction(unresolvedIdentifier, true); } @Override public void registerTable(String name, Table table) { UnresolvedIdentifier identifier = UnresolvedIdentifier.of(name); createTemporaryView(identifier, table); } @Override public void createTemporaryView(String path, Table view) { Preconditions.checkNotNull(path, "Path must not be null."); Preconditions.checkNotNull(view, "Table view must not be null."); UnresolvedIdentifier identifier = getParser().parseIdentifier(path); createTemporaryView(identifier, view); } private void createTemporaryView(UnresolvedIdentifier identifier, Table view) { if (((TableImpl) view).getTableEnvironment() != this) { throw new TableException( "Only table API objects that belong to this TableEnvironment can be registered."); } ObjectIdentifier tableIdentifier = catalogManager.qualifyIdentifier(identifier); QueryOperation queryOperation = qualifyQueryOperation(tableIdentifier, view.getQueryOperation()); CatalogBaseTable tableTable = new QueryOperationCatalogView(queryOperation); catalogManager.createTemporaryTable(tableTable, tableIdentifier, false); } @Override public Table scan(String... 
tablePath) { UnresolvedIdentifier unresolvedIdentifier = UnresolvedIdentifier.of(tablePath); return scanInternal(unresolvedIdentifier) .map(this::createTable) .orElseThrow( () -> new ValidationException( String.format( "Table %s was not found.", unresolvedIdentifier))); } @Override public Table from(String path) { UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); return scanInternal(unresolvedIdentifier) .map(this::createTable) .orElseThrow( () -> new ValidationException( String.format( "Table %s was not found.", unresolvedIdentifier))); } @Override public void insertInto(String targetPath, Table table) { UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(targetPath); insertIntoInternal(unresolvedIdentifier, table); } @Override public void insertInto(Table table, String sinkPath, String... sinkPathContinued) { List fullPath = new ArrayList<>(Arrays.asList(sinkPathContinued)); fullPath.add(0, sinkPath); UnresolvedIdentifier unresolvedIdentifier = UnresolvedIdentifier.of(fullPath); insertIntoInternal(unresolvedIdentifier, table); } private void insertIntoInternal(UnresolvedIdentifier unresolvedIdentifier, Table table) { ObjectIdentifier objectIdentifier = catalogManager.qualifyIdentifier(unresolvedIdentifier); List modifyOperations = Collections.singletonList( new CatalogSinkModifyOperation( objectIdentifier, table.getQueryOperation())); buffer(modifyOperations); } private Optional scanInternal(UnresolvedIdentifier identifier) { ObjectIdentifier tableIdentifier = catalogManager.qualifyIdentifier(identifier); return catalogManager .getTable(tableIdentifier) .map(t -> new CatalogQueryOperation(tableIdentifier, t.getResolvedSchema())); } @Override public ConnectTableDescriptor connect(ConnectorDescriptor connectorDescriptor) { return new StreamTableDescriptor(registration, connectorDescriptor); } @Override public String[] listCatalogs() { return catalogManager.listCatalogs().stream().sorted().toArray(String[]::new); } @Override public String[] listModules() { return moduleManager.listModules().toArray(new String[0]); } @Override public ModuleEntry[] listFullModules() { return moduleManager.listFullModules().toArray(new ModuleEntry[0]); } @Override public String[] listDatabases() { return catalogManager .getCatalog(catalogManager.getCurrentCatalog()) .get() .listDatabases() .toArray(new String[0]); } @Override public String[] listTables() { return catalogManager.listTables().stream().sorted().toArray(String[]::new); } @Override public String[] listViews() { return catalogManager.listViews().stream().sorted().toArray(String[]::new); } @Override public String[] listTemporaryTables() { return catalogManager.listTemporaryTables().stream().sorted().toArray(String[]::new); } @Override public String[] listTemporaryViews() { return catalogManager.listTemporaryViews().stream().sorted().toArray(String[]::new); } @Override public boolean dropTemporaryTable(String path) { UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); ObjectIdentifier identifier = catalogManager.qualifyIdentifier(unresolvedIdentifier); try { catalogManager.dropTemporaryTable(identifier, false); return true; } catch (ValidationException e) { return false; } } @Override public boolean dropTemporaryView(String path) { UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); ObjectIdentifier identifier = catalogManager.qualifyIdentifier(unresolvedIdentifier); try { catalogManager.dropTemporaryView(identifier, false); return true; } catch 
(ValidationException e) { return false; } } @Override public String[] listUserDefinedFunctions() { String[] functions = functionCatalog.getUserDefinedFunctions(); Arrays.sort(functions); return functions; } @Override public String[] listFunctions() { String[] functions = functionCatalog.getFunctions(); Arrays.sort(functions); return functions; } @Override public String explain(Table table) { return explain(table, false); } @Override public String explain(Table table, boolean extended) { return planner.explain( Collections.singletonList(table.getQueryOperation()), getExplainDetails(extended)); } @Override public String explain(boolean extended) { List operations = bufferedModifyOperations.stream() .map(o -> (Operation) o) .collect(Collectors.toList()); return planner.explain(operations, getExplainDetails(extended)); } @Override public String explainSql(String statement, ExplainDetail... extraDetails) { List operations = getParser().parse(statement); if (operations.size() != 1) { throw new TableException( "Unsupported SQL query! explainSql() only accepts a single SQL query."); } return explainInternal(operations, extraDetails); } @Override public String explainInternal(List operations, ExplainDetail... extraDetails) { operations = operations.stream() .filter(o -> !(o instanceof NopOperation)) .collect(Collectors.toList()); // hive parser may generate an NopOperation, in which case we just return an // empty string as the plan if (operations.isEmpty()) { return ""; } else { return planner.explain(operations, extraDetails); } } @Override public String[] getCompletionHints(String statement, int position) { return planner.getParser().getCompletionHints(statement, position); } @Override public Table sqlQuery(String query) { List operations = getParser().parse(query); if (operations.size() != 1) { throw new ValidationException( "Unsupported SQL query! sqlQuery() only accepts a single SQL query."); } Operation operation = operations.get(0); if (operation instanceof QueryOperation && !(operation instanceof ModifyOperation)) { return createTable((QueryOperation) operation); } else { throw new ValidationException( "Unsupported SQL query! 
sqlQuery() only accepts a single SQL query of type " + "SELECT, UNION, INTERSECT, EXCEPT, VALUES, and ORDER_BY."); } } @Override public TableResult executeSql(String statement) { List operations = getParser().parse(statement); if (operations.size() != 1) { throw new TableException(UNSUPPORTED_QUERY_IN_EXECUTE_SQL_MSG); } return executeInternal(operations.get(0)); } @Override public StatementSet createStatementSet() { return new StatementSetImpl(this); } @Override public TableResult executeInternal(List operations) { List> transformations = translate(operations); List sinkIdentifierNames = extractSinkIdentifierNames(operations); TableResult result = executeInternal(transformations, sinkIdentifierNames); if (tableConfig.getConfiguration().get(TABLE_DML_SYNC)) { try { result.await(); } catch (InterruptedException | ExecutionException e) { result.getJobClient().ifPresent(JobClient::cancel); throw new TableException("Fail to wait execution finish.", e); } } return result; } private TableResult executeInternal( List> transformations, List sinkIdentifierNames) { String jobName = getJobName("insert-into_" + String.join(",", sinkIdentifierNames)); Pipeline pipeline = execEnv.createPipeline(transformations, tableConfig, jobName); try { JobClient jobClient = execEnv.executeAsync(pipeline); final List columns = new ArrayList<>(); Object[] affectedRowCounts = new Long[transformations.size()]; for (int i = 0; i < transformations.size(); ++i) { // use sink identifier name as field name columns.add(Column.physical(sinkIdentifierNames.get(i), DataTypes.BIGINT())); affectedRowCounts[i] = -1L; } return TableResultImpl.builder() .jobClient(jobClient) .resultKind(ResultKind.SUCCESS_WITH_CONTENT) .schema(ResolvedSchema.of(columns)) .data( new InsertResultIterator( jobClient, Row.of(affectedRowCounts), userClassLoader)) .build(); } catch (Exception e) { throw new TableException("Failed to execute sql", e); } } private TableResult executeQueryOperation(QueryOperation operation) { final UnresolvedIdentifier unresolvedIdentifier = UnresolvedIdentifier.of( "Unregistered_Collect_Sink_" + CollectModifyOperation.getUniqueId()); final ObjectIdentifier objectIdentifier = catalogManager.qualifyIdentifier(unresolvedIdentifier); CollectModifyOperation sinkOperation = new CollectModifyOperation(objectIdentifier, operation); List> transformations = translate(Collections.singletonList(sinkOperation)); String jobName = getJobName("collect"); Pipeline pipeline = execEnv.createPipeline(transformations, tableConfig, jobName); try { JobClient jobClient = execEnv.executeAsync(pipeline); CollectResultProvider resultProvider = sinkOperation.getSelectResultProvider(); resultProvider.setJobClient(jobClient); return TableResultImpl.builder() .jobClient(jobClient) .resultKind(ResultKind.SUCCESS_WITH_CONTENT) .schema(operation.getResolvedSchema()) .data(resultProvider.getResultIterator()) .setPrintStyle( TableResultImpl.PrintStyle.tableau( PrintUtils.MAX_COLUMN_WIDTH, PrintUtils.NULL_COLUMN, true, isStreamingMode)) .setSessionTimeZone(getConfig().getLocalTimeZone()) .build(); } catch (Exception e) { throw new TableException("Failed to execute sql", e); } } @Override public void sqlUpdate(String stmt) { List operations = getParser().parse(stmt); if (operations.size() != 1) { throw new TableException(UNSUPPORTED_QUERY_IN_SQL_UPDATE_MSG); } Operation operation = operations.get(0); if (operation instanceof ModifyOperation) { buffer(Collections.singletonList((ModifyOperation) operation)); } else if (operation instanceof CreateTableOperation || 
operation instanceof DropTableOperation || operation instanceof AlterTableOperation || operation instanceof CreateViewOperation || operation instanceof DropViewOperation || operation instanceof CreateDatabaseOperation || operation instanceof DropDatabaseOperation || operation instanceof AlterDatabaseOperation || operation instanceof CreateCatalogFunctionOperation || operation instanceof CreateTempSystemFunctionOperation || operation instanceof DropCatalogFunctionOperation || operation instanceof DropTempSystemFunctionOperation || operation instanceof AlterCatalogFunctionOperation || operation instanceof CreateCatalogOperation || operation instanceof DropCatalogOperation || operation instanceof UseCatalogOperation || operation instanceof UseDatabaseOperation || operation instanceof LoadModuleOperation || operation instanceof UnloadModuleOperation || operation instanceof NopOperation) { executeInternal(operation); } else { throw new TableException(UNSUPPORTED_QUERY_IN_SQL_UPDATE_MSG); } } @Override public TableResult executeInternal(Operation operation) { if (operation instanceof ModifyOperation) { return executeInternal(Collections.singletonList((ModifyOperation) operation)); } else if (operation instanceof CreateTableOperation) { CreateTableOperation createTableOperation = (CreateTableOperation) operation; if (createTableOperation.isTemporary()) { catalogManager.createTemporaryTable( createTableOperation.getCatalogTable(), createTableOperation.getTableIdentifier(), createTableOperation.isIgnoreIfExists()); } else { catalogManager.createTable( createTableOperation.getCatalogTable(), createTableOperation.getTableIdentifier(), createTableOperation.isIgnoreIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof DropTableOperation) { DropTableOperation dropTableOperation = (DropTableOperation) operation; if (dropTableOperation.isTemporary()) { catalogManager.dropTemporaryTable( dropTableOperation.getTableIdentifier(), dropTableOperation.isIfExists()); } else { catalogManager.dropTable( dropTableOperation.getTableIdentifier(), dropTableOperation.isIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof AlterTableOperation) { AlterTableOperation alterTableOperation = (AlterTableOperation) operation; Catalog catalog = getCatalogOrThrowException( alterTableOperation.getTableIdentifier().getCatalogName()); String exMsg = getDDLOpExecuteErrorMsg(alterTableOperation.asSummaryString()); try { if (alterTableOperation instanceof AlterTableRenameOperation) { AlterTableRenameOperation alterTableRenameOp = (AlterTableRenameOperation) operation; catalog.renameTable( alterTableRenameOp.getTableIdentifier().toObjectPath(), alterTableRenameOp.getNewTableIdentifier().getObjectName(), false); } else if (alterTableOperation instanceof AlterTableOptionsOperation) { AlterTableOptionsOperation alterTablePropertiesOp = (AlterTableOptionsOperation) operation; catalogManager.alterTable( alterTablePropertiesOp.getCatalogTable(), alterTablePropertiesOp.getTableIdentifier(), false); } else if (alterTableOperation instanceof AlterTableAddConstraintOperation) { AlterTableAddConstraintOperation addConstraintOP = (AlterTableAddConstraintOperation) operation; CatalogTable oriTable = (CatalogTable) catalogManager .getTable(addConstraintOP.getTableIdentifier()) .get() .getTable(); TableSchema.Builder builder = TableSchemaUtils.builderWithGivenSchema(oriTable.getSchema()); if (addConstraintOP.getConstraintName().isPresent()) { builder.primaryKey( 
addConstraintOP.getConstraintName().get(), addConstraintOP.getColumnNames()); } else { builder.primaryKey(addConstraintOP.getColumnNames()); } CatalogTable newTable = new CatalogTableImpl( builder.build(), oriTable.getPartitionKeys(), oriTable.getOptions(), oriTable.getComment()); catalogManager.alterTable( newTable, addConstraintOP.getTableIdentifier(), false); } else if (alterTableOperation instanceof AlterTableDropConstraintOperation) { AlterTableDropConstraintOperation dropConstraintOperation = (AlterTableDropConstraintOperation) operation; CatalogTable oriTable = (CatalogTable) catalogManager .getTable(dropConstraintOperation.getTableIdentifier()) .get() .getTable(); CatalogTable newTable = new CatalogTableImpl( TableSchemaUtils.dropConstraint( oriTable.getSchema(), dropConstraintOperation.getConstraintName()), oriTable.getPartitionKeys(), oriTable.getOptions(), oriTable.getComment()); catalogManager.alterTable( newTable, dropConstraintOperation.getTableIdentifier(), false); } else if (alterTableOperation instanceof AlterPartitionPropertiesOperation) { AlterPartitionPropertiesOperation alterPartPropsOp = (AlterPartitionPropertiesOperation) operation; catalog.alterPartition( alterPartPropsOp.getTableIdentifier().toObjectPath(), alterPartPropsOp.getPartitionSpec(), alterPartPropsOp.getCatalogPartition(), false); } else if (alterTableOperation instanceof AlterTableSchemaOperation) { AlterTableSchemaOperation alterTableSchemaOperation = (AlterTableSchemaOperation) alterTableOperation; catalogManager.alterTable( alterTableSchemaOperation.getCatalogTable(), alterTableSchemaOperation.getTableIdentifier(), false); } else if (alterTableOperation instanceof AddPartitionsOperation) { AddPartitionsOperation addPartitionsOperation = (AddPartitionsOperation) alterTableOperation; List specs = addPartitionsOperation.getPartitionSpecs(); List partitions = addPartitionsOperation.getCatalogPartitions(); boolean ifNotExists = addPartitionsOperation.ifNotExists(); ObjectPath tablePath = addPartitionsOperation.getTableIdentifier().toObjectPath(); for (int i = 0; i < specs.size(); i++) { catalog.createPartition( tablePath, specs.get(i), partitions.get(i), ifNotExists); } } else if (alterTableOperation instanceof DropPartitionsOperation) { DropPartitionsOperation dropPartitionsOperation = (DropPartitionsOperation) alterTableOperation; ObjectPath tablePath = dropPartitionsOperation.getTableIdentifier().toObjectPath(); boolean ifExists = dropPartitionsOperation.ifExists(); for (CatalogPartitionSpec spec : dropPartitionsOperation.getPartitionSpecs()) { catalog.dropPartition(tablePath, spec, ifExists); } } return TableResultImpl.TABLE_RESULT_OK; } catch (TableAlreadyExistException | TableNotExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof CreateViewOperation) { CreateViewOperation createViewOperation = (CreateViewOperation) operation; if (createViewOperation.isTemporary()) { catalogManager.createTemporaryTable( createViewOperation.getCatalogView(), createViewOperation.getViewIdentifier(), createViewOperation.isIgnoreIfExists()); } else { catalogManager.createTable( createViewOperation.getCatalogView(), createViewOperation.getViewIdentifier(), createViewOperation.isIgnoreIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof DropViewOperation) { DropViewOperation dropViewOperation = (DropViewOperation) operation; if (dropViewOperation.isTemporary()) { 
catalogManager.dropTemporaryView( dropViewOperation.getViewIdentifier(), dropViewOperation.isIfExists()); } else { catalogManager.dropView( dropViewOperation.getViewIdentifier(), dropViewOperation.isIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof AlterViewOperation) { AlterViewOperation alterViewOperation = (AlterViewOperation) operation; Catalog catalog = getCatalogOrThrowException( alterViewOperation.getViewIdentifier().getCatalogName()); String exMsg = getDDLOpExecuteErrorMsg(alterViewOperation.asSummaryString()); try { if (alterViewOperation instanceof AlterViewRenameOperation) { AlterViewRenameOperation alterTableRenameOp = (AlterViewRenameOperation) operation; catalog.renameTable( alterTableRenameOp.getViewIdentifier().toObjectPath(), alterTableRenameOp.getNewViewIdentifier().getObjectName(), false); } else if (alterViewOperation instanceof AlterViewPropertiesOperation) { AlterViewPropertiesOperation alterTablePropertiesOp = (AlterViewPropertiesOperation) operation; catalogManager.alterTable( alterTablePropertiesOp.getCatalogView(), alterTablePropertiesOp.getViewIdentifier(), false); } else if (alterViewOperation instanceof AlterViewAsOperation) { AlterViewAsOperation alterViewAsOperation = (AlterViewAsOperation) alterViewOperation; catalogManager.alterTable( alterViewAsOperation.getNewView(), alterViewAsOperation.getViewIdentifier(), false); } return TableResultImpl.TABLE_RESULT_OK; } catch (TableAlreadyExistException | TableNotExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof CreateDatabaseOperation) { CreateDatabaseOperation createDatabaseOperation = (CreateDatabaseOperation) operation; Catalog catalog = getCatalogOrThrowException(createDatabaseOperation.getCatalogName()); String exMsg = getDDLOpExecuteErrorMsg(createDatabaseOperation.asSummaryString()); try { catalog.createDatabase( createDatabaseOperation.getDatabaseName(), createDatabaseOperation.getCatalogDatabase(), createDatabaseOperation.isIgnoreIfExists()); return TableResultImpl.TABLE_RESULT_OK; } catch (DatabaseAlreadyExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof DropDatabaseOperation) { DropDatabaseOperation dropDatabaseOperation = (DropDatabaseOperation) operation; Catalog catalog = getCatalogOrThrowException(dropDatabaseOperation.getCatalogName()); String exMsg = getDDLOpExecuteErrorMsg(dropDatabaseOperation.asSummaryString()); try { catalog.dropDatabase( dropDatabaseOperation.getDatabaseName(), dropDatabaseOperation.isIfExists(), dropDatabaseOperation.isCascade()); return TableResultImpl.TABLE_RESULT_OK; } catch (DatabaseNotExistException | DatabaseNotEmptyException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof AlterDatabaseOperation) { AlterDatabaseOperation alterDatabaseOperation = (AlterDatabaseOperation) operation; Catalog catalog = getCatalogOrThrowException(alterDatabaseOperation.getCatalogName()); String exMsg = getDDLOpExecuteErrorMsg(alterDatabaseOperation.asSummaryString()); try { catalog.alterDatabase( alterDatabaseOperation.getDatabaseName(), alterDatabaseOperation.getCatalogDatabase(), false); return TableResultImpl.TABLE_RESULT_OK; } catch (DatabaseNotExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new 
TableException(exMsg, e); } } else if (operation instanceof CreateCatalogFunctionOperation) { return createCatalogFunction((CreateCatalogFunctionOperation) operation); } else if (operation instanceof CreateTempSystemFunctionOperation) { return createSystemFunction((CreateTempSystemFunctionOperation) operation); } else if (operation instanceof DropCatalogFunctionOperation) { return dropCatalogFunction((DropCatalogFunctionOperation) operation); } else if (operation instanceof DropTempSystemFunctionOperation) { return dropSystemFunction((DropTempSystemFunctionOperation) operation); } else if (operation instanceof AlterCatalogFunctionOperation) { return alterCatalogFunction((AlterCatalogFunctionOperation) operation); } else if (operation instanceof CreateCatalogOperation) { return createCatalog((CreateCatalogOperation) operation); } else if (operation instanceof DropCatalogOperation) { DropCatalogOperation dropCatalogOperation = (DropCatalogOperation) operation; String exMsg = getDDLOpExecuteErrorMsg(dropCatalogOperation.asSummaryString()); try { catalogManager.unregisterCatalog( dropCatalogOperation.getCatalogName(), dropCatalogOperation.isIfExists()); return TableResultImpl.TABLE_RESULT_OK; } catch (CatalogException e) { throw new ValidationException(exMsg, e); } } else if (operation instanceof LoadModuleOperation) { return loadModule((LoadModuleOperation) operation); } else if (operation instanceof UnloadModuleOperation) { return unloadModule((UnloadModuleOperation) operation); } else if (operation instanceof UseModulesOperation) { return useModules((UseModulesOperation) operation); } else if (operation instanceof UseCatalogOperation) { UseCatalogOperation useCatalogOperation = (UseCatalogOperation) operation; catalogManager.setCurrentCatalog(useCatalogOperation.getCatalogName()); return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof UseDatabaseOperation) { UseDatabaseOperation useDatabaseOperation = (UseDatabaseOperation) operation; catalogManager.setCurrentCatalog(useDatabaseOperation.getCatalogName()); catalogManager.setCurrentDatabase(useDatabaseOperation.getDatabaseName()); return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof ShowCatalogsOperation) { return buildShowResult("catalog name", listCatalogs()); } else if (operation instanceof ShowCurrentCatalogOperation) { return buildShowResult( "current catalog name", new String[] {catalogManager.getCurrentCatalog()}); } else if (operation instanceof ShowDatabasesOperation) { return buildShowResult("database name", listDatabases()); } else if (operation instanceof ShowCurrentDatabaseOperation) { return buildShowResult( "current database name", new String[] {catalogManager.getCurrentDatabase()}); } else if (operation instanceof ShowModulesOperation) { ShowModulesOperation showModulesOperation = (ShowModulesOperation) operation; if (showModulesOperation.requireFull()) { return buildShowFullModulesResult(listFullModules()); } else { return buildShowResult("module name", listModules()); } } else if (operation instanceof ShowTablesOperation) { return buildShowResult("table name", listTables()); } else if (operation instanceof ShowFunctionsOperation) { ShowFunctionsOperation showFunctionsOperation = (ShowFunctionsOperation) operation; String[] functionNames = null; ShowFunctionsOperation.FunctionScope functionScope = showFunctionsOperation.getFunctionScope(); switch (functionScope) { case USER: functionNames = listUserDefinedFunctions(); break; case ALL: functionNames = listFunctions(); break; default: throw 
new UnsupportedOperationException( String.format( "SHOW FUNCTIONS with %s scope is not supported.", functionScope)); } return buildShowResult("function name", functionNames); } else if (operation instanceof ShowViewsOperation) { return buildShowResult("view name", listViews()); } else if (operation instanceof ShowPartitionsOperation) { String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { ShowPartitionsOperation showPartitionsOperation = (ShowPartitionsOperation) operation; Catalog catalog = getCatalogOrThrowException( showPartitionsOperation.getTableIdentifier().getCatalogName()); ObjectPath tablePath = showPartitionsOperation.getTableIdentifier().toObjectPath(); CatalogPartitionSpec partitionSpec = showPartitionsOperation.getPartitionSpec(); List partitionSpecs = partitionSpec == null ? catalog.listPartitions(tablePath) : catalog.listPartitions(tablePath, partitionSpec); List partitionNames = new ArrayList<>(partitionSpecs.size()); for (CatalogPartitionSpec spec : partitionSpecs) { List partitionKVs = new ArrayList<>(spec.getPartitionSpec().size()); for (Map.Entry partitionKV : spec.getPartitionSpec().entrySet()) { partitionKVs.add(partitionKV.getKey() + "=" + partitionKV.getValue()); } partitionNames.add(String.join("/", partitionKVs)); } return buildShowResult("partition name", partitionNames.toArray(new String[0])); } catch (TableNotExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof ExplainOperation) { String explanation = explainInternal( Collections.singletonList(((ExplainOperation) operation).getChild())); return TableResultImpl.builder() .resultKind(ResultKind.SUCCESS_WITH_CONTENT) .schema(ResolvedSchema.of(Column.physical("result", DataTypes.STRING()))) .data(Collections.singletonList(Row.of(explanation))) .setPrintStyle(TableResultImpl.PrintStyle.rawContent()) .setSessionTimeZone(getConfig().getLocalTimeZone()) .build(); } else if (operation instanceof DescribeTableOperation) { DescribeTableOperation describeTableOperation = (DescribeTableOperation) operation; Optional result = catalogManager.getTable(describeTableOperation.getSqlIdentifier()); if (result.isPresent()) { return buildDescribeResult(result.get().getResolvedSchema()); } else { throw new ValidationException( String.format( "Tables or views with the identifier '%s' doesn't exist", describeTableOperation.getSqlIdentifier().asSummaryString())); } } else if (operation instanceof QueryOperation) { return executeQueryOperation((QueryOperation) operation); } else if (operation instanceof CreateTableASOperation) { executeInternal(((CreateTableASOperation) operation).getCreateTableOperation()); return executeInternal(((CreateTableASOperation) operation).getInsertOperation()); } else if (operation instanceof NopOperation) { return TableResultImpl.TABLE_RESULT_OK; } else { throw new TableException(UNSUPPORTED_QUERY_IN_EXECUTE_SQL_MSG); } } private TableResult createCatalog(CreateCatalogOperation operation) { String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { String catalogName = operation.getCatalogName(); Map properties = operation.getProperties(); Catalog catalog = FactoryUtil.createCatalog( catalogName, properties, tableConfig.getConfiguration(), userClassLoader); catalogManager.registerCatalog(catalogName, catalog); return TableResultImpl.TABLE_RESULT_OK; } catch (CatalogException e) { throw new ValidationException(exMsg, e); } } private TableResult 
loadModule(LoadModuleOperation operation) { String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { // find module by name Map properties = new HashMap<>(operation.getProperties()); if (properties.containsKey(MODULE_TYPE)) { throw new ValidationException( String.format( "Property 'type' = '%s' is not supported since module name " + "is used to find module", properties.get(MODULE_TYPE))); } properties.put(MODULE_TYPE, operation.getModuleName()); final ModuleFactory factory = TableFactoryService.find(ModuleFactory.class, properties, userClassLoader); moduleManager.loadModule(operation.getModuleName(), factory.createModule(properties)); return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw new ValidationException(String.format("%s. %s", exMsg, e.getMessage()), e); } catch (Exception e) { throw new TableException(String.format("%s. %s", exMsg, e.getMessage()), e); } } private TableResult unloadModule(UnloadModuleOperation operation) { String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { moduleManager.unloadModule(operation.getModuleName()); return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw new ValidationException(String.format("%s. %s", exMsg, e.getMessage()), e); } } private TableResult useModules(UseModulesOperation operation) { String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { moduleManager.useModules(operation.getModuleNames().toArray(new String[0])); return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw new ValidationException(String.format("%s. %s", exMsg, e.getMessage()), e); } } private TableResult buildShowResult(String columnName, String[] objects) { return buildResult( new String[] {columnName}, new DataType[] {DataTypes.STRING()}, Arrays.stream(objects).map((c) -> new String[] {c}).toArray(String[][]::new)); } private TableResult buildShowFullModulesResult(ModuleEntry[] moduleEntries) { Object[][] rows = Arrays.stream(moduleEntries) .map(entry -> new Object[] {entry.name(), entry.used()}) .toArray(Object[][]::new); return buildResult( new String[] {"module name", "used"}, new DataType[] {DataTypes.STRING(), DataTypes.BOOLEAN()}, rows); } private TableResult buildDescribeResult(ResolvedSchema schema) { Map fieldToWatermark = schema.getWatermarkSpecs().stream() .collect( Collectors.toMap( WatermarkSpec::getRowtimeAttribute, spec -> spec.getWatermarkExpression().asSummaryString())); Map fieldToPrimaryKey = new HashMap<>(); schema.getPrimaryKey() .ifPresent( (p) -> { List columns = p.getColumns(); columns.forEach( (c) -> fieldToPrimaryKey.put( c, String.format( "PRI(%s)", String.join(", ", columns)))); }); Object[][] rows = schema.getColumns().stream() .map( (c) -> { final LogicalType logicalType = c.getDataType().getLogicalType(); return new Object[] { c.getName(), logicalType.copy(true).asSummaryString(), logicalType.isNullable(), fieldToPrimaryKey.getOrDefault(c.getName(), null), c.explainExtras().orElse(null), fieldToWatermark.getOrDefault(c.getName(), null) }; }) .toArray(Object[][]::new); return buildResult( new String[] {"name", "type", "null", "key", "extras", "watermark"}, new DataType[] { DataTypes.STRING(), DataTypes.STRING(), DataTypes.BOOLEAN(), DataTypes.STRING(), DataTypes.STRING(), DataTypes.STRING() }, rows); } private TableResult buildResult(String[] headers, DataType[] types, Object[][] rows) { return TableResultImpl.builder() .resultKind(ResultKind.SUCCESS_WITH_CONTENT) .schema(ResolvedSchema.physical(headers, types)) 
.data(Arrays.stream(rows).map(Row::of).collect(Collectors.toList())) .setPrintStyle( TableResultImpl.PrintStyle.tableau(Integer.MAX_VALUE, "", false, false)) .setSessionTimeZone(getConfig().getLocalTimeZone()) .build(); } /** * extract sink identifier names from {@link ModifyOperation}s. * *
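    // Usage sketch for the executeInternal dispatch above: each SQL statement below is parsed by
    // TableEnvironment#executeSql into one of the Operation subclasses handled in that method.
    // The environment variable name "tEnv" is illustrative, not part of this file.
    //
    //   tEnv.executeSql("CREATE CATALOG c1 WITH ('type' = 'generic_in_memory')"); // CreateCatalogOperation
    //   tEnv.executeSql("LOAD MODULE hive");                                      // LoadModuleOperation
    //   tEnv.executeSql("SHOW CATALOGS");                                         // ShowCatalogsOperation
    //   tEnv.executeSql("SHOW PARTITIONS t1");                                    // ShowPartitionsOperation
    //   tEnv.executeSql("DESCRIBE t1");                                           // DescribeTableOperation
    //   tEnv.executeSql("EXPLAIN PLAN FOR SELECT * FROM t1");                     // ExplainOperation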

If there are multiple ModifyOperations have same name, an index suffix will be added at * the end of the name to ensure each name is unique. */ private List extractSinkIdentifierNames(List operations) { List tableNames = new ArrayList<>(operations.size()); Map tableNameToCount = new HashMap<>(); for (ModifyOperation operation : operations) { if (operation instanceof CatalogSinkModifyOperation) { ObjectIdentifier identifier = ((CatalogSinkModifyOperation) operation).getTableIdentifier(); String fullName = identifier.asSummaryString(); tableNames.add(fullName); tableNameToCount.put(fullName, tableNameToCount.getOrDefault(fullName, 0) + 1); } else { throw new UnsupportedOperationException("Unsupported operation: " + operation); } } Map tableNameToIndex = new HashMap<>(); return tableNames.stream() .map( tableName -> { if (tableNameToCount.get(tableName) == 1) { return tableName; } else { Integer index = tableNameToIndex.getOrDefault(tableName, 0) + 1; tableNameToIndex.put(tableName, index); return tableName + "_" + index; } }) .collect(Collectors.toList()); } private String getJobName(String defaultJobName) { return tableConfig.getConfiguration().getString(PipelineOptions.NAME, defaultJobName); } /** Get catalog from catalogName or throw a ValidationException if the catalog not exists. */ private Catalog getCatalogOrThrowException(String catalogName) { return getCatalog(catalogName) .orElseThrow( () -> new ValidationException( String.format("Catalog %s does not exist", catalogName))); } private String getDDLOpExecuteErrorMsg(String action) { return String.format("Could not execute %s", action); } @Override public String getCurrentCatalog() { return catalogManager.getCurrentCatalog(); } @Override public void useCatalog(String catalogName) { catalogManager.setCurrentCatalog(catalogName); } @Override public String getCurrentDatabase() { return catalogManager.getCurrentDatabase(); } @Override public void useDatabase(String databaseName) { catalogManager.setCurrentDatabase(databaseName); } @Override public TableConfig getConfig() { return tableConfig; } @Override public JobExecutionResult execute(String jobName) throws Exception { Pipeline pipeline = execEnv.createPipeline(translateAndClearBuffer(), tableConfig, jobName); return execEnv.execute(pipeline); } @Override public Parser getParser() { return getPlanner().getParser(); } @Override public CatalogManager getCatalogManager() { return catalogManager; } @Override public OperationTreeBuilder getOperationTreeBuilder() { return operationTreeBuilder; } /** * Subclasses can override this method to transform the given QueryOperation to a new one with * the qualified object identifier. This is needed for some QueryOperations, e.g. * JavaDataStreamQueryOperation, which doesn't know the registered identifier when created * ({@code fromDataStream(DataStream)}. But the identifier is required when converting this * QueryOperation to RelNode. */ protected QueryOperation qualifyQueryOperation( ObjectIdentifier identifier, QueryOperation queryOperation) { return queryOperation; } /** * Subclasses can override this method to add additional checks. * * @param tableSource tableSource to validate */ protected void validateTableSource(TableSource tableSource) { TableSourceValidation.validateTableSource(tableSource, tableSource.getTableSchema()); } /** * Translate the buffered operations to Transformations, and clear the buffer. * *
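    // Illustrative note on extractSinkIdentifierNames above: duplicate sink table names get an
    // index suffix so that every returned name is unique, while names that occur only once are
    // kept as-is. For a statement set writing twice into `cat.db.T` and once into `cat.db.S`
    // (identifiers here are hypothetical), the method returns:
    //
    //   ["cat.db.T_1", "cat.db.T_2", "cat.db.S"]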

The buffer will be clear even if the `translate` fails. In most cases, the failure is not * retryable (e.g. type mismatch, can't generate physical plan). If the buffer is not clear * after failure, the following `translate` will also fail. */ protected List> translateAndClearBuffer() { List> transformations; try { transformations = translate(bufferedModifyOperations); } finally { bufferedModifyOperations.clear(); } return transformations; } private List> translate(List modifyOperations) { return planner.translate(modifyOperations); } private void buffer(List modifyOperations) { bufferedModifyOperations.addAll(modifyOperations); } @VisibleForTesting protected ExplainDetail[] getExplainDetails(boolean extended) { if (extended) { if (isStreamingMode) { return new ExplainDetail[] { ExplainDetail.ESTIMATED_COST, ExplainDetail.CHANGELOG_MODE }; } else { return new ExplainDetail[] {ExplainDetail.ESTIMATED_COST}; } } else { return new ExplainDetail[0]; } } @Override public void registerTableSourceInternal(String name, TableSource tableSource) { validateTableSource(tableSource); ObjectIdentifier objectIdentifier = catalogManager.qualifyIdentifier(UnresolvedIdentifier.of(name)); Optional table = getTemporaryTable(objectIdentifier); if (table.isPresent()) { if (table.get() instanceof ConnectorCatalogTable) { ConnectorCatalogTable sourceSinkTable = (ConnectorCatalogTable) table.get(); if (sourceSinkTable.getTableSource().isPresent()) { throw new ValidationException( String.format( "Table '%s' already exists. Please choose a different name.", name)); } else { // wrapper contains only sink (not source) ConnectorCatalogTable sourceAndSink = ConnectorCatalogTable.sourceAndSink( tableSource, sourceSinkTable.getTableSink().get(), !IS_STREAM_TABLE); catalogManager.dropTemporaryTable(objectIdentifier, false); catalogManager.createTemporaryTable(sourceAndSink, objectIdentifier, false); } } else { throw new ValidationException( String.format( "Table '%s' already exists. Please choose a different name.", name)); } } else { ConnectorCatalogTable source = ConnectorCatalogTable.source(tableSource, !IS_STREAM_TABLE); catalogManager.createTemporaryTable(source, objectIdentifier, false); } } @Override public void registerTableSinkInternal(String name, TableSink tableSink) { ObjectIdentifier objectIdentifier = catalogManager.qualifyIdentifier(UnresolvedIdentifier.of(name)); Optional table = getTemporaryTable(objectIdentifier); if (table.isPresent()) { if (table.get() instanceof ConnectorCatalogTable) { ConnectorCatalogTable sourceSinkTable = (ConnectorCatalogTable) table.get(); if (sourceSinkTable.getTableSink().isPresent()) { throw new ValidationException( String.format( "Table '%s' already exists. Please choose a different name.", name)); } else { // wrapper contains only sink (not source) ConnectorCatalogTable sourceAndSink = ConnectorCatalogTable.sourceAndSink( sourceSinkTable.getTableSource().get(), tableSink, !IS_STREAM_TABLE); catalogManager.dropTemporaryTable(objectIdentifier, false); catalogManager.createTemporaryTable(sourceAndSink, objectIdentifier, false); } } else { throw new ValidationException( String.format( "Table '%s' already exists. 
Please choose a different name.", name)); } } else { ConnectorCatalogTable sink = ConnectorCatalogTable.sink(tableSink, !IS_STREAM_TABLE); catalogManager.createTemporaryTable(sink, objectIdentifier, false); } } private Optional getTemporaryTable(ObjectIdentifier identifier) { return catalogManager .getTable(identifier) .filter(CatalogManager.TableLookupResult::isTemporary) .map(CatalogManager.TableLookupResult::getTable); } private TableResult createCatalogFunction( CreateCatalogFunctionOperation createCatalogFunctionOperation) { String exMsg = getDDLOpExecuteErrorMsg(createCatalogFunctionOperation.asSummaryString()); try { if (createCatalogFunctionOperation.isTemporary()) { functionCatalog.registerTemporaryCatalogFunction( UnresolvedIdentifier.of( createCatalogFunctionOperation.getFunctionIdentifier().toList()), createCatalogFunctionOperation.getCatalogFunction(), createCatalogFunctionOperation.isIgnoreIfExists()); } else { Catalog catalog = getCatalogOrThrowException( createCatalogFunctionOperation .getFunctionIdentifier() .getCatalogName()); catalog.createFunction( createCatalogFunctionOperation.getFunctionIdentifier().toObjectPath(), createCatalogFunctionOperation.getCatalogFunction(), createCatalogFunctionOperation.isIgnoreIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (FunctionAlreadyExistException e) { throw new ValidationException(e.getMessage(), e); } catch (Exception e) { throw new TableException(exMsg, e); } } private TableResult alterCatalogFunction( AlterCatalogFunctionOperation alterCatalogFunctionOperation) { String exMsg = getDDLOpExecuteErrorMsg(alterCatalogFunctionOperation.asSummaryString()); try { CatalogFunction function = alterCatalogFunctionOperation.getCatalogFunction(); if (alterCatalogFunctionOperation.isTemporary()) { throw new ValidationException("Alter temporary catalog function is not supported"); } else { Catalog catalog = getCatalogOrThrowException( alterCatalogFunctionOperation .getFunctionIdentifier() .getCatalogName()); catalog.alterFunction( alterCatalogFunctionOperation.getFunctionIdentifier().toObjectPath(), function, alterCatalogFunctionOperation.isIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (FunctionNotExistException e) { throw new ValidationException(e.getMessage(), e); } catch (Exception e) { throw new TableException(exMsg, e); } } private TableResult dropCatalogFunction( DropCatalogFunctionOperation dropCatalogFunctionOperation) { String exMsg = getDDLOpExecuteErrorMsg(dropCatalogFunctionOperation.asSummaryString()); try { if (dropCatalogFunctionOperation.isTemporary()) { functionCatalog.dropTempCatalogFunction( dropCatalogFunctionOperation.getFunctionIdentifier(), dropCatalogFunctionOperation.isIfExists()); } else { Catalog catalog = getCatalogOrThrowException( dropCatalogFunctionOperation .getFunctionIdentifier() .getCatalogName()); catalog.dropFunction( dropCatalogFunctionOperation.getFunctionIdentifier().toObjectPath(), dropCatalogFunctionOperation.isIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (FunctionNotExistException e) { throw new ValidationException(e.getMessage(), e); } catch (Exception e) { throw new TableException(exMsg, e); } } private TableResult createSystemFunction(CreateTempSystemFunctionOperation operation) { String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { functionCatalog.registerTemporarySystemFunction( 
operation.getFunctionName(), operation.getCatalogFunction(), operation.isIgnoreIfExists()); return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (Exception e) { throw new TableException(exMsg, e); } } private TableResult dropSystemFunction(DropTempSystemFunctionOperation operation) { try { functionCatalog.dropTemporarySystemFunction( operation.getFunctionName(), operation.isIfExists()); return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (Exception e) { throw new TableException(getDDLOpExecuteErrorMsg(operation.asSummaryString()), e); } } protected TableImpl createTable(QueryOperation tableOperation) { return TableImpl.createTable( this, tableOperation, operationTreeBuilder, functionCatalog.asLookup(getParser()::parseIdentifier)); } @Override public String getJsonPlan(String stmt) { List operations = getParser().parse(stmt); if (operations.size() != 1) { throw new TableException( "Unsupported SQL query! getJsonPlan() only accepts a single INSERT statement."); } Operation operation = operations.get(0); List modifyOperations = new ArrayList<>(1); if (operation instanceof ModifyOperation) { modifyOperations.add((ModifyOperation) operation); } else { throw new TableException("Only INSERT is supported now."); } return getJsonPlan(modifyOperations); } @Override public String getJsonPlan(List operations) { return planner.getJsonPlan(operations); } @Override public String explainJsonPlan(String jsonPlan, ExplainDetail... extraDetails) { return planner.explainJsonPlan(jsonPlan, extraDetails); } @Override public TableResult executeJsonPlan(String jsonPlan) { List> transformations = planner.translateJsonPlan(jsonPlan); List sinkIdentifierNames = new ArrayList<>(); for (int i = 0; i < transformations.size(); ++i) { // TODO serialize the sink table names to json plan ? sinkIdentifierNames.add("sink" + i); } return executeInternal(transformations, sinkIdentifierNames); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.13/org/apache/flink/util/ExceptionUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // // The function "stringifyException" is based on source code from the Hadoop Project // (http://hadoop.apache.org/), // licensed by the Apache Software Foundation (ASF) under the Apache License, Version 2.0. // See the NOTICE file distributed with this work for additional information regarding copyright // ownership. 
// package org.apache.flink.util; import com.zto.fire.common.util.ExceptionBus; import org.apache.flink.annotation.Internal; import org.apache.flink.util.function.RunnableWithException; import javax.annotation.Nullable; import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; import java.lang.reflect.Field; import java.util.Optional; import java.util.concurrent.CompletionException; import java.util.concurrent.ExecutionException; import java.util.function.Function; import java.util.function.Predicate; import static org.apache.flink.util.Preconditions.checkNotNull; /** A collection of utility functions for dealing with exceptions and exception workflows. */ @Internal public final class ExceptionUtils { /** The stringified representation of a null exception reference. */ public static final String STRINGIFIED_NULL_EXCEPTION = "(null)"; // TODO: ------------ start:二次开发代码 --------------- // /** * Makes a string representation of the exception's stack trace, or "(null)", if the exception * is null. * *

This method makes a best effort and never fails. * * @param e The exception to stringify. * @return A string with exception name and call stack. */ public static String stringifyException(final Throwable e) { return stringifyException(e, ""); } /** * Makes a string representation of the exception's stack trace, or "(null)", if the exception * is null. * *

This method makes a best effort and never fails. * * @param e The exception to stringify. * @return A string with exception name and call stack. */ public static String stringifyException(final Throwable e, String sql) { if (e == null) { return STRINGIFIED_NULL_EXCEPTION; } try { StringWriter stm = new StringWriter(); PrintWriter wrt = new PrintWriter(stm); e.printStackTrace(wrt); wrt.close(); ExceptionBus.post(e, sql); return stm.toString(); } catch (Throwable t) { return e.getClass().getName() + " (error while printing stack trace)"; } } // TODO: ------------ end:二次开发代码 --------------- // /** * Checks whether the given exception indicates a situation that may leave the JVM in a * corrupted state, meaning a state where continued normal operation can only be guaranteed via * clean process restart. * *
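    // Usage sketch: the two-argument overload above forwards every stringified exception to the
    // fire ExceptionBus before returning the stack trace, so existing Flink call sites of the
    // one-argument method report to fire transparently. The call site below is hypothetical.
    //
    //   try {
    //       runUserCode();
    //   } catch (Exception e) {
    //       // prints the stack trace string and, as a side effect, calls ExceptionBus.post(e, "")
    //       System.err.println(ExceptionUtils.stringifyException(e));
    //   }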

Currently considered fatal exceptions are Virtual Machine errors indicating that the JVM * is corrupted, like {@link InternalError}, {@link UnknownError}, and {@link * java.util.zip.ZipError} (a special case of InternalError). The {@link ThreadDeath} exception * is also treated as a fatal error, because when a thread is forcefully stopped, there is a * high chance that parts of the system are in an inconsistent state. * * @param t The exception to check. * @return True, if the exception is considered fatal to the JVM, false otherwise. */ public static boolean isJvmFatalError(Throwable t) { return (t instanceof InternalError) || (t instanceof UnknownError) || (t instanceof ThreadDeath); } /** * Checks whether the given exception indicates a situation that may leave the JVM in a * corrupted state, or an out-of-memory error. * *

See {@link ExceptionUtils#isJvmFatalError(Throwable)} for a list of fatal JVM errors. This * method additionally classifies the {@link OutOfMemoryError} as fatal, because it may occur in * any thread (not the one that allocated the majority of the memory) and thus is often not * recoverable by destroying the particular thread that threw the exception. * * @param t The exception to check. * @return True, if the exception is fatal to the JVM or and OutOfMemoryError, false otherwise. */ public static boolean isJvmFatalOrOutOfMemoryError(Throwable t) { return isJvmFatalError(t) || t instanceof OutOfMemoryError; } /** * Tries to enrich OutOfMemoryErrors being part of the passed root Throwable's cause tree. * *

This method improves error messages for direct and metaspace {@link OutOfMemoryError}. It * adds description about the possible causes and ways of resolution. * * @param root The Throwable of which the cause tree shall be traversed. * @param jvmMetaspaceOomNewErrorMessage The message being used for JVM metaspace-related * OutOfMemoryErrors. Passing null will disable handling this class of error. * @param jvmDirectOomNewErrorMessage The message being used for direct memory-related * OutOfMemoryErrors. Passing null will disable handling this class of error. * @param jvmHeapSpaceOomNewErrorMessage The message being used for Heap space-related * OutOfMemoryErrors. Passing null will disable handling this class of error. */ public static void tryEnrichOutOfMemoryError( @Nullable Throwable root, @Nullable String jvmMetaspaceOomNewErrorMessage, @Nullable String jvmDirectOomNewErrorMessage, @Nullable String jvmHeapSpaceOomNewErrorMessage) { updateDetailMessage( root, t -> { if (isMetaspaceOutOfMemoryError(t)) { return jvmMetaspaceOomNewErrorMessage; } else if (isDirectOutOfMemoryError(t)) { return jvmDirectOomNewErrorMessage; } else if (isHeapSpaceOutOfMemoryError(t)) { return jvmHeapSpaceOomNewErrorMessage; } return null; }); } /** * Updates error messages of Throwables appearing in the cause tree of the passed root * Throwable. The passed Function is applied on each Throwable of the cause tree. Returning a * String will cause the detailMessage of the corresponding Throwable to be updated. Returning * null, instead, won't trigger any detailMessage update on that Throwable. * * @param root The Throwable whose cause tree shall be traversed. * @param throwableToMessage The Function based on which the new messages are generated. The * function implementation should return the new message. Returning null, in * contrast, will result in not updating the message for the corresponding Throwable. */ public static void updateDetailMessage( @Nullable Throwable root, @Nullable Function throwableToMessage) { if (throwableToMessage == null) { return; } Throwable it = root; while (it != null) { String newMessage = throwableToMessage.apply(it); if (newMessage != null) { updateDetailMessageOfThrowable(it, newMessage); } it = it.getCause(); } } private static void updateDetailMessageOfThrowable( Throwable throwable, String newDetailMessage) { Field field; try { field = Throwable.class.getDeclaredField("detailMessage"); } catch (NoSuchFieldException e) { throw new IllegalStateException( "The JDK Throwable contains a detailMessage member. The Throwable class provided on the classpath does not which is why this exception appears.", e); } field.setAccessible(true); try { field.set(throwable, newDetailMessage); } catch (IllegalAccessException e) { throw new IllegalStateException( "The JDK Throwable contains a private detailMessage member that should be accessible through reflection. This is not the case for the Throwable class provided on the classpath.", e); } } /** * Checks whether the given exception indicates a JVM metaspace out-of-memory error. * * @param t The exception to check. * @return True, if the exception is the metaspace {@link OutOfMemoryError}, false otherwise. */ public static boolean isMetaspaceOutOfMemoryError(@Nullable Throwable t) { return isOutOfMemoryErrorWithMessageStartingWith(t, "Metaspace"); } /** * Checks whether the given exception indicates a JVM direct out-of-memory error. * * @param t The exception to check. 
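    // Minimal sketch of the enrichment hook above; rootThrowable and the replacement messages are
    // illustrative, and a null argument leaves that class of OutOfMemoryError untouched.
    //
    //   ExceptionUtils.tryEnrichOutOfMemoryError(
    //           rootThrowable,
    //           "Metaspace OOM: consider increasing the JVM metaspace size",   // metaspace message
    //           "Direct buffer OOM: consider increasing off-heap memory",      // direct memory message
    //           null);                                                         // keep heap OOM message as-is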
* @return True, if the exception is the direct {@link OutOfMemoryError}, false otherwise. */ public static boolean isDirectOutOfMemoryError(@Nullable Throwable t) { return isOutOfMemoryErrorWithMessageStartingWith(t, "Direct buffer memory"); } public static boolean isHeapSpaceOutOfMemoryError(@Nullable Throwable t) { return isOutOfMemoryErrorWithMessageStartingWith(t, "Java heap space"); } private static boolean isOutOfMemoryErrorWithMessageStartingWith( @Nullable Throwable t, String prefix) { // the exact matching of the class is checked to avoid matching any custom subclasses of // OutOfMemoryError // as we are interested in the original exceptions, generated by JVM. return isOutOfMemoryError(t) && t.getMessage() != null && t.getMessage().startsWith(prefix); } private static boolean isOutOfMemoryError(@Nullable Throwable t) { return t != null && t.getClass() == OutOfMemoryError.class; } /** * Rethrows the given {@code Throwable}, if it represents an error that is fatal to the JVM. See * {@link ExceptionUtils#isJvmFatalError(Throwable)} for a definition of fatal errors. * * @param t The Throwable to check and rethrow. */ public static void rethrowIfFatalError(Throwable t) { if (isJvmFatalError(t)) { throw (Error) t; } } /** * Rethrows the given {@code Throwable}, if it represents an error that is fatal to the JVM or * an out-of-memory error. See {@link ExceptionUtils#isJvmFatalError(Throwable)} for a * definition of fatal errors. * * @param t The Throwable to check and rethrow. */ public static void rethrowIfFatalErrorOrOOM(Throwable t) { if (isJvmFatalError(t) || t instanceof OutOfMemoryError) { throw (Error) t; } } /** * Adds a new exception as a {@link Throwable#addSuppressed(Throwable) suppressed exception} to * a prior exception, or returns the new exception, if no prior exception exists. * *

{@code
     * public void closeAllThings() throws Exception {
     *     Exception ex = null;
     *     try {
     *         component.shutdown();
     *     } catch (Exception e) {
     *         ex = firstOrSuppressed(e, ex);
     *     }
     *     try {
     *         anotherComponent.stop();
     *     } catch (Exception e) {
     *         ex = firstOrSuppressed(e, ex);
     *     }
     *     try {
     *         lastComponent.shutdown();
     *     } catch (Exception e) {
     *         ex = firstOrSuppressed(e, ex);
     *     }
     *
     *     if (ex != null) {
     *         throw ex;
     *     }
     * }
     * }
* * @param newException The newly occurred exception * @param previous The previously occurred exception, possibly null. * @return The new exception, if no previous exception exists, or the previous exception with * the new exception in the list of suppressed exceptions. */ public static T firstOrSuppressed(T newException, @Nullable T previous) { checkNotNull(newException, "newException"); if (previous == null || previous == newException) { return newException; } else { previous.addSuppressed(newException); return previous; } } /** * Throws the given {@code Throwable} in scenarios where the signatures do not allow you to * throw an arbitrary Throwable. Errors and RuntimeExceptions are thrown directly, other * exceptions are packed into runtime exceptions * * @param t The throwable to be thrown. */ public static void rethrow(Throwable t) { if (t instanceof Error) { throw (Error) t; } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else { throw new RuntimeException(t); } } /** * Throws the given {@code Throwable} in scenarios where the signatures do not allow you to * throw an arbitrary Throwable. Errors and RuntimeExceptions are thrown directly, other * exceptions are packed into a parent RuntimeException. * * @param t The throwable to be thrown. * @param parentMessage The message for the parent RuntimeException, if one is needed. */ public static void rethrow(Throwable t, String parentMessage) { if (t instanceof Error) { throw (Error) t; } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else { throw new RuntimeException(parentMessage, t); } } /** * Throws the given {@code Throwable} in scenarios where the signatures do allow to throw a * Exception. Errors and Exceptions are thrown directly, other "exotic" subclasses of Throwable * are wrapped in an Exception. * * @param t The throwable to be thrown. * @param parentMessage The message for the parent Exception, if one is needed. */ public static void rethrowException(Throwable t, String parentMessage) throws Exception { if (t instanceof Error) { throw (Error) t; } else if (t instanceof Exception) { throw (Exception) t; } else { throw new Exception(parentMessage, t); } } /** * Throws the given {@code Throwable} in scenarios where the signatures do allow to throw a * Exception. Errors and Exceptions are thrown directly, other "exotic" subclasses of Throwable * are wrapped in an Exception. * * @param t The throwable to be thrown. */ public static void rethrowException(Throwable t) throws Exception { if (t instanceof Error) { throw (Error) t; } else if (t instanceof Exception) { throw (Exception) t; } else { throw new Exception(t.getMessage(), t); } } /** * Tries to throw the given exception if not null. * * @param e exception to throw if not null. * @throws Exception */ public static void tryRethrowException(@Nullable Exception e) throws Exception { if (e != null) { throw e; } } /** * Tries to throw the given {@code Throwable} in scenarios where the signatures allows only * IOExceptions (and RuntimeException and Error). Throws this exception directly, if it is an * IOException, a RuntimeException, or an Error. Otherwise does nothing. * * @param t The Throwable to be thrown. 
*/ public static void tryRethrowIOException(Throwable t) throws IOException { if (t instanceof IOException) { throw (IOException) t; } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else if (t instanceof Error) { throw (Error) t; } } /** * Re-throws the given {@code Throwable} in scenarios where the signatures allows only * IOExceptions (and RuntimeException and Error). * *

Throws this exception directly, if it is an IOException, a RuntimeException, or an Error. * Otherwise it wraps it in an IOException and throws it. * * @param t The Throwable to be thrown. */ public static void rethrowIOException(Throwable t) throws IOException { if (t instanceof IOException) { throw (IOException) t; } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else if (t instanceof Error) { throw (Error) t; } else { throw new IOException(t.getMessage(), t); } } /** * Checks whether a throwable chain contains a specific type of exception and returns it. It * deserializes any {@link SerializedThrowable} that are found using the provided {@link * ClassLoader}. * * @param throwable the throwable chain to check. * @param searchType the type of exception to search for in the chain. * @param classLoader to use for deserialization. * @return Optional throwable of the requested type if available, otherwise empty */ public static Optional findSerializedThrowable( Throwable throwable, Class searchType, ClassLoader classLoader) { if (throwable == null || searchType == null) { return Optional.empty(); } Throwable t = throwable; while (t != null) { if (searchType.isAssignableFrom(t.getClass())) { return Optional.of(searchType.cast(t)); } else if (t.getClass().isAssignableFrom(SerializedThrowable.class)) { Throwable next = ((SerializedThrowable) t).deserializeError(classLoader); // SerializedThrowable#deserializeError returns itself under some conditions (e.g., // null cause). // If that happens, exit to avoid looping infinitely. This is ok because if the user // was searching // for a SerializedThrowable, we would have returned it in the initial if condition. t = (next == t) ? null : next; } else { t = t.getCause(); } } return Optional.empty(); } /** * Checks whether a throwable chain contains a specific type of exception and returns it. * * @param throwable the throwable chain to check. * @param searchType the type of exception to search for in the chain. * @return Optional throwable of the requested type if available, otherwise empty */ public static Optional findThrowable( Throwable throwable, Class searchType) { if (throwable == null || searchType == null) { return Optional.empty(); } // TODO: ------------ start:二次开发代码 --------------- // ExceptionBus.post(throwable, ""); // TODO: ------------ end:二次开发代码 --------------- // Throwable t = throwable; while (t != null) { if (searchType.isAssignableFrom(t.getClass())) { return Optional.of(searchType.cast(t)); } else { t = t.getCause(); } } return Optional.empty(); } /** * Checks whether a throwable chain contains a specific type of exception and returns it. This * method handles {@link SerializedThrowable}s in the chain and deserializes them with the given * ClassLoader. * *

SerializedThrowables are often used when exceptions might come from dynamically loaded * code and be transported over RPC / HTTP for better error reporting. The receiving processes * or threads might not have the dynamically loaded code available. * * @param throwable the throwable chain to check. * @param searchType the type of exception to search for in the chain. * @param classLoader the ClassLoader to use when encountering a SerializedThrowable. * @return Optional throwable of the requested type if available, otherwise empty */ public static Optional findThrowableSerializedAware( Throwable throwable, Class searchType, ClassLoader classLoader) { if (throwable == null || searchType == null) { return Optional.empty(); } Throwable t = throwable; while (t != null) { if (searchType.isAssignableFrom(t.getClass())) { return Optional.of(searchType.cast(t)); } else if (t instanceof SerializedThrowable) { t = ((SerializedThrowable) t).deserializeError(classLoader); } else { t = t.getCause(); } } return Optional.empty(); } /** * Checks whether a throwable chain contains an exception matching a predicate and returns it. * * @param throwable the throwable chain to check. * @param predicate the predicate of the exception to search for in the chain. * @return Optional throwable of the requested type if available, otherwise empty */ public static Optional findThrowable( Throwable throwable, Predicate predicate) { if (throwable == null || predicate == null) { return Optional.empty(); } Throwable t = throwable; while (t != null) { if (predicate.test(t)) { return Optional.of(t); } else { t = t.getCause(); } } return Optional.empty(); } /** * Checks whether a throwable chain contains a specific error message and returns the * corresponding throwable. * * @param throwable the throwable chain to check. * @param searchMessage the error message to search for in the chain. * @return Optional throwable containing the search message if available, otherwise empty */ public static Optional findThrowableWithMessage( Throwable throwable, String searchMessage) { if (throwable == null || searchMessage == null) { return Optional.empty(); } Throwable t = throwable; while (t != null) { if (t.getMessage() != null && t.getMessage().contains(searchMessage)) { return Optional.of(t); } else { t = t.getCause(); } } return Optional.empty(); } /** * Unpacks an {@link ExecutionException} and returns its cause. Otherwise the given Throwable is * returned. * * @param throwable to unpack if it is an ExecutionException * @return Cause of ExecutionException or given Throwable */ public static Throwable stripExecutionException(Throwable throwable) { return stripException(throwable, ExecutionException.class); } /** * Unpacks an {@link CompletionException} and returns its cause. Otherwise the given Throwable * is returned. * * @param throwable to unpack if it is an CompletionException * @return Cause of CompletionException or given Throwable */ public static Throwable stripCompletionException(Throwable throwable) { return stripException(throwable, CompletionException.class); } /** * Unpacks an specified exception and returns its cause. Otherwise the given {@link Throwable} * is returned. 
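    // Usage sketch for the stripExecutionException helper above (the wrapped exception is illustrative):
    //
    //   Throwable wrapped = new ExecutionException(new IllegalStateException("real cause"));
    //   Throwable cause = ExceptionUtils.stripExecutionException(wrapped);
    //   // cause is the IllegalStateException; nested ExecutionExceptions are unwrapped until a
    //   // non-matching cause (or a missing cause) is reached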
* * @param throwableToStrip to strip * @param typeToStrip type to strip * @return Unpacked cause or given Throwable if not packed */ public static Throwable stripException( Throwable throwableToStrip, Class typeToStrip) { while (typeToStrip.isAssignableFrom(throwableToStrip.getClass()) && throwableToStrip.getCause() != null) { throwableToStrip = throwableToStrip.getCause(); } return throwableToStrip; } /** * Tries to find a {@link SerializedThrowable} as the cause of the given throwable and throws * its deserialized value. If there is no such throwable, then the original throwable is thrown. * * @param throwable to check for a SerializedThrowable * @param classLoader to be used for the deserialization of the SerializedThrowable * @throws Throwable either the deserialized throwable or the given throwable */ public static void tryDeserializeAndThrow(Throwable throwable, ClassLoader classLoader) throws Throwable { Throwable current = throwable; while (!(current instanceof SerializedThrowable) && current.getCause() != null) { current = current.getCause(); } if (current instanceof SerializedThrowable) { throw ((SerializedThrowable) current).deserializeError(classLoader); } else { throw throwable; } } /** * Checks whether the given exception is a {@link InterruptedException} and sets the interrupted * flag accordingly. * * @param e to check whether it is an {@link InterruptedException} */ public static void checkInterrupted(Throwable e) { if (e instanceof InterruptedException) { Thread.currentThread().interrupt(); } } // ------------------------------------------------------------------------ // Lambda exception utilities // ------------------------------------------------------------------------ public static void suppressExceptions(RunnableWithException action) { try { action.run(); } catch (InterruptedException e) { // restore interrupted state Thread.currentThread().interrupt(); } catch (Throwable t) { if (isJvmFatalError(t)) { rethrow(t); } } } // ------------------------------------------------------------------------ /** Private constructor to prevent instantiation. */ private ExceptionUtils() {} } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/client/deployment/application/ApplicationDispatcherBootstrap.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.client.deployment.application; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.JobID; import org.apache.flink.api.common.time.Time; import org.apache.flink.client.ClientUtils; import org.apache.flink.client.cli.ClientOptions; import org.apache.flink.client.deployment.application.executors.EmbeddedExecutor; import org.apache.flink.client.deployment.application.executors.EmbeddedExecutorServiceLoader; import org.apache.flink.client.program.PackagedProgram; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.HighAvailabilityOptions; import org.apache.flink.configuration.PipelineOptionsInternal; import org.apache.flink.core.execution.PipelineExecutorServiceLoader; import org.apache.flink.runtime.client.DuplicateJobSubmissionException; import org.apache.flink.runtime.clusterframework.ApplicationStatus; import org.apache.flink.runtime.dispatcher.DispatcherBootstrap; import org.apache.flink.runtime.dispatcher.DispatcherGateway; import org.apache.flink.runtime.jobgraph.JobGraph; import org.apache.flink.runtime.jobmanager.HighAvailabilityMode; import org.apache.flink.runtime.jobmaster.JobResult; import org.apache.flink.runtime.messages.Acknowledge; import org.apache.flink.runtime.messages.FlinkJobNotFoundException; import org.apache.flink.runtime.rpc.FatalErrorHandler; import org.apache.flink.util.ExceptionUtils; import org.apache.flink.util.concurrent.FutureUtils; import org.apache.flink.util.concurrent.ScheduledExecutor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.*; import java.util.concurrent.*; import java.util.function.Function; import java.util.stream.Collectors; import static org.apache.flink.util.Preconditions.checkNotNull; /** * A {@link DispatcherBootstrap} used for running the user's {@code main()} in "Application Mode" * (see FLIP-85). * *

This dispatcher bootstrap submits the recovered {@link JobGraph job graphs} for re-execution * (in case of recovery from a failure), and then submits the remaining jobs of the application for * execution. * *
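// Context sketch: this bootstrap is active when a job is deployed in Application Mode, for
// example via the Flink CLI (the jar path below is illustrative):
//
//   ./bin/flink run-application -t yarn-application /path/to/user-job.jar
//
// The user's main() then runs inside the cluster entrypoint rather than on the client, and this
// class submits the resulting jobs through the DispatcherGateway.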

To achieve this, it works in conjunction with the {@link EmbeddedExecutor EmbeddedExecutor} * which decides if it should submit a job for execution (in case of a new job) or the job was * already recovered and is running. */ @Internal public class ApplicationDispatcherBootstrap implements DispatcherBootstrap { private static final Logger LOG = LoggerFactory.getLogger(ApplicationDispatcherBootstrap.class); public static final JobID ZERO_JOB_ID = new JobID(0, 0); private final PackagedProgram application; private final Collection recoveredJobIds; private final Configuration configuration; private final FatalErrorHandler errorHandler; private final CompletableFuture applicationCompletionFuture; private final CompletableFuture clusterShutdownFuture; private ScheduledFuture applicationExecutionTask; public ApplicationDispatcherBootstrap( final PackagedProgram application, final Collection recoveredJobIds, final Configuration configuration, final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor, final FatalErrorHandler errorHandler) { this.configuration = checkNotNull(configuration); this.recoveredJobIds = checkNotNull(recoveredJobIds); this.application = checkNotNull(application); this.errorHandler = checkNotNull(errorHandler); this.applicationCompletionFuture = fixJobIdAndRunApplicationAsync(dispatcherGateway, scheduledExecutor); this.clusterShutdownFuture = runApplicationAndShutdownClusterAsync(dispatcherGateway); } @Override public void stop() { if (applicationExecutionTask != null) { applicationExecutionTask.cancel(true); } if (applicationCompletionFuture != null) { applicationCompletionFuture.cancel(true); } } @VisibleForTesting ScheduledFuture getApplicationExecutionFuture() { return applicationExecutionTask; } @VisibleForTesting CompletableFuture getApplicationCompletionFuture() { return applicationCompletionFuture; } @VisibleForTesting CompletableFuture getClusterShutdownFuture() { return clusterShutdownFuture; } /** * Runs the user program entrypoint and shuts down the given dispatcherGateway when the * application completes (either successfully or in case of failure). 
*/ private CompletableFuture runApplicationAndShutdownClusterAsync( final DispatcherGateway dispatcherGateway) { final CompletableFuture shutdownFuture = applicationCompletionFuture .handle( (ignored, t) -> { if (t == null) { LOG.info("Application completed SUCCESSFULLY"); return dispatcherGateway.shutDownCluster( ApplicationStatus.SUCCEEDED); } final Optional maybeException = ExceptionUtils.findThrowable( t, UnsuccessfulExecutionException.class); if (maybeException.isPresent()) { final ApplicationStatus applicationStatus = maybeException.get().getStatus(); if (applicationStatus == ApplicationStatus.CANCELED || applicationStatus == ApplicationStatus.FAILED) { LOG.info("Application {}: ", applicationStatus, t); return dispatcherGateway.shutDownCluster( applicationStatus); } } if (t instanceof CancellationException) { LOG.warn( "Application has been cancelled because the {} is being stopped.", ApplicationDispatcherBootstrap.class .getSimpleName()); return CompletableFuture.completedFuture(Acknowledge.get()); } LOG.warn("Application failed unexpectedly: ", t); return FutureUtils.completedExceptionally(t); }) .thenCompose(Function.identity()); FutureUtils.handleUncaughtException(shutdownFuture, (t, e) -> errorHandler.onFatalError(e)); return shutdownFuture; } private CompletableFuture fixJobIdAndRunApplicationAsync( final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor) { final Optional configuredJobId = configuration.getOptional(PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID); if (!HighAvailabilityMode.isHighAvailabilityModeActivated(configuration) && !configuredJobId.isPresent()) { return runApplicationAsync(dispatcherGateway, scheduledExecutor, false); } // TODO: ------------ start:二次开发代码 --------------- // if (!configuredJobId.isPresent()) { String haClusterId = configuration.getString(HighAvailabilityOptions.HA_CLUSTER_ID); String[] splits = (haClusterId != null ? haClusterId : "").split("_"); if (splits != null && splits.length == 3) { configuration.set( PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID, new JobID(Long.valueOf(splits[1]), Long.valueOf(splits[2])).toHexString()); } else { configuration.set( PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID, ZERO_JOB_ID.toHexString()); } } // TODO: ------------ end:二次开发代码 --------------- // return runApplicationAsync(dispatcherGateway, scheduledExecutor, true); } /** * Runs the user program entrypoint by scheduling a task on the given {@code scheduledExecutor}. * The returned {@link CompletableFuture} completes when all jobs of the user application * succeeded. if any of them fails, or if job submission fails. 
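// Illustrative note on the fire-added block above: when no fixed job id is configured, the id is
// derived from high-availability.cluster-id. Assuming a cluster id of the form "<name>_<hi>_<lo>",
// for example
//
//   high-availability.cluster-id: myJob_11_22
//
// the fixed job id becomes new JobID(11L, 22L).toHexString(); any other shape falls back to
// ZERO_JOB_ID (all zeros), which keeps the JobID stable across restarts of the same application.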
*/ private CompletableFuture runApplicationAsync( final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor, final boolean enforceSingleJobExecution) { final CompletableFuture<List<JobID>> applicationExecutionFuture = new CompletableFuture<>(); final Set tolerateMissingResult = Collections.synchronizedSet(new HashSet<>()); // we need to hand in a future as return value because we need to get those JobIDs out // from the scheduled task that executes the user program applicationExecutionTask = scheduledExecutor.schedule( () -> runApplicationEntryPoint( applicationExecutionFuture, tolerateMissingResult, dispatcherGateway, scheduledExecutor, enforceSingleJobExecution), 0L, TimeUnit.MILLISECONDS); return applicationExecutionFuture.thenCompose( jobIds -> getApplicationResult( dispatcherGateway, jobIds, tolerateMissingResult, scheduledExecutor)); } /** * Runs the user program entrypoint and completes the given {@code jobIdsFuture} with the {@link * JobID JobIDs} of the submitted jobs. * *

This should be executed in a separate thread (or task). */ private void runApplicationEntryPoint( final CompletableFuture> jobIdsFuture, final Set tolerateMissingResult, final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor, final boolean enforceSingleJobExecution) { try { final List applicationJobIds = new ArrayList<>(recoveredJobIds); final PipelineExecutorServiceLoader executorServiceLoader = new EmbeddedExecutorServiceLoader( applicationJobIds, dispatcherGateway, scheduledExecutor); ClientUtils.executeProgram( executorServiceLoader, configuration, application, enforceSingleJobExecution, true /* suppress sysout */); if (applicationJobIds.isEmpty()) { jobIdsFuture.completeExceptionally( new ApplicationExecutionException( "The application contains no execute() calls.")); } else { jobIdsFuture.complete(applicationJobIds); } } catch (Throwable t) { // If we're running in a single job execution mode, it's safe to consider re-submission // of an already finished a success. final Optional maybeDuplicate = ExceptionUtils.findThrowable(t, DuplicateJobSubmissionException.class); if (enforceSingleJobExecution && maybeDuplicate.isPresent() && maybeDuplicate.get().isGloballyTerminated()) { final JobID jobId = maybeDuplicate.get().getJobID(); tolerateMissingResult.add(jobId); jobIdsFuture.complete(Collections.singletonList(jobId)); } else { jobIdsFuture.completeExceptionally( new ApplicationExecutionException("Could not execute application.", t)); } } } private CompletableFuture getApplicationResult( final DispatcherGateway dispatcherGateway, final Collection applicationJobIds, final Set tolerateMissingResult, final ScheduledExecutor executor) { final List> jobResultFutures = applicationJobIds.stream() .map( jobId -> unwrapJobResultException( getJobResult( dispatcherGateway, jobId, executor, tolerateMissingResult.contains(jobId)))) .collect(Collectors.toList()); return FutureUtils.waitForAll(jobResultFutures); } private CompletableFuture getJobResult( final DispatcherGateway dispatcherGateway, final JobID jobId, final ScheduledExecutor scheduledExecutor, final boolean tolerateMissingResult) { final Time timeout = Time.milliseconds(configuration.get(ClientOptions.CLIENT_TIMEOUT).toMillis()); final Time retryPeriod = Time.milliseconds(configuration.get(ClientOptions.CLIENT_RETRY_PERIOD).toMillis()); final CompletableFuture jobResultFuture = JobStatusPollingUtils.getJobResult( dispatcherGateway, jobId, scheduledExecutor, timeout, retryPeriod); if (tolerateMissingResult) { // Return "unknown" job result if dispatcher no longer knows the actual result. return FutureUtils.handleException( jobResultFuture, FlinkJobNotFoundException.class, exception -> new JobResult.Builder() .jobId(jobId) .applicationStatus(ApplicationStatus.UNKNOWN) .netRuntime(Long.MAX_VALUE) .build()); } return jobResultFuture; } /** * If the given {@link JobResult} indicates success, this passes through the {@link JobResult}. * Otherwise, this returns a future that is finished exceptionally (potentially with an * exception from the {@link JobResult}). 
*/ private CompletableFuture unwrapJobResultException( final CompletableFuture jobResult) { return jobResult.thenApply( result -> { if (result.isSuccess()) { return result; } throw new CompletionException( UnsuccessfulExecutionException.fromJobResult( result, application.getUserCodeClassLoader())); }); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/configuration/GlobalConfiguration.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.configuration; import com.zto.fire.common.conf.FireFrameworkConf; import com.zto.fire.common.util.OSUtils; import com.zto.fire.common.util.PropUtils; import org.apache.flink.annotation.Internal; import org.apache.flink.util.Preconditions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.collection.JavaConversions; import javax.annotation.Nullable; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.lang.reflect.Method; import java.net.ServerSocket; import java.util.HashMap; import java.util.Map; import java.util.concurrent.atomic.AtomicBoolean; /** * Global configuration object for Flink. Similar to Java properties configuration objects it * includes key-value pairs which represent the framework's configuration. */ @Internal public final class GlobalConfiguration { private static final Logger LOG = LoggerFactory.getLogger(GlobalConfiguration.class); public static final String FLINK_CONF_FILENAME = "flink-conf.yaml"; // the keys whose values should be hidden private static final String[] SENSITIVE_KEYS = new String[] {"password", "secret", "fs.azure.account.key", "apikey"}; // the hidden content to be displayed public static final String HIDDEN_CONTENT = "******"; // -------------------------------------------------------------------------------------------- private GlobalConfiguration() {} // -------------------------------------------------------------------------------------------- /** * Loads the global configuration from the environment. Fails if an error occurs during loading. * Returns an empty configuration object if the environment variable is not set. In production * this variable is set but tests and local execution/debugging don't have this environment * variable set. That's why we should fail if it is not set. * * @return Returns the Configuration */ public static Configuration loadConfiguration() { return loadConfiguration(new Configuration()); } /** * Loads the global configuration and adds the given dynamic properties configuration. 
* * @param dynamicProperties The given dynamic properties * @return Returns the loaded global configuration with dynamic properties */ public static Configuration loadConfiguration(Configuration dynamicProperties) { final String configDir = System.getenv(ConfigConstants.ENV_FLINK_CONF_DIR); if (configDir == null) { return new Configuration(dynamicProperties); } return loadConfiguration(configDir, dynamicProperties); } /** * Loads the configuration files from the specified directory. * *
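// Usage sketch for the overload above (the key and value are illustrative):
//
//   Configuration dynamic = new Configuration();
//   dynamic.setString("parallelism.default", "4");
//   // reads flink-conf.yaml from $FLINK_CONF_DIR if set, then adds the dynamic properties on top
//   Configuration conf = GlobalConfiguration.loadConfiguration(dynamic);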

YAML files are supported as configuration files. * * @param configDir the directory which contains the configuration files */ public static Configuration loadConfiguration(final String configDir) { return loadConfiguration(configDir, null); } /** * Loads the configuration files from the specified directory. If the dynamic properties * configuration is not null, then it is added to the loaded configuration. * * @param configDir directory to load the configuration from * @param dynamicProperties configuration file containing the dynamic properties. Null if none. * @return The configuration loaded from the given configuration directory */ public static Configuration loadConfiguration( final String configDir, @Nullable final Configuration dynamicProperties) { if (configDir == null) { throw new IllegalArgumentException( "Given configuration directory is null, cannot load configuration"); } final File confDirFile = new File(configDir); if (!(confDirFile.exists())) { throw new IllegalConfigurationException( "The given configuration directory name '" + configDir + "' (" + confDirFile.getAbsolutePath() + ") does not describe an existing directory."); } // get Flink yaml configuration file final File yamlConfigFile = new File(confDirFile, FLINK_CONF_FILENAME); if (!yamlConfigFile.exists()) { throw new IllegalConfigurationException( "The Flink config file '" + yamlConfigFile + "' (" + yamlConfigFile.getAbsolutePath() + ") does not exist."); } Configuration configuration = loadYAMLResource(yamlConfigFile); if (dynamicProperties != null) { configuration.addAll(dynamicProperties); } return configuration; } /** * Loads a YAML-file of key-value pairs. * *

Colon and whitespace ": " separate key and value (one per line). The hash tag "#" starts a * single-line comment. * *

Example: * *

     * jobmanager.rpc.address: localhost # network address for communication with the job manager
     * jobmanager.rpc.port   : 6123      # network port to connect to for communication with the job manager
     * taskmanager.rpc.port  : 6122      # network port the task manager expects incoming IPC connections
     * 
This does not span the whole YAML specification, but only the *syntax* of simple YAML
 * key-value pairs (see issue #113 on GitHub). If at any point in time there is a need to go
 * beyond simple key-value pair syntax, compatibility will allow to introduce a YAML parser
 * library.
 *
 * @param file the YAML file to read from
 * @see YAML 1.2 specification
 */
private static Configuration loadYAMLResource(File file) {
    final Configuration config = new Configuration();

    // TODO: ------------ start: secondary development code --------------- //
    Method setSetting = null;
    try {
        Class<?> env = Class.forName("org.apache.flink.runtime.util.EnvironmentInformation");
        setSetting = env.getMethod("setSetting", String.class, String.class);
    } catch (Exception e) {
        LOG.error("Failed to look up EnvironmentInformation.setSetting()", e);
    }
    // TODO: ------------ end: secondary development code --------------- //

    try (BufferedReader reader =
            new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {
        String line;
        int lineNo = 0;
        while ((line = reader.readLine()) != null) {
            lineNo++;
            // 1. check for comments
            String[] comments = line.split("#", 2);
            String conf = comments[0].trim();

            // 2. get key and value
            if (conf.length() > 0) {
                String[] kv = conf.split(": ", 2);

                // skip line with no valid key-value pair
                if (kv.length == 1) {
                    LOG.warn(
                            "Error while trying to split key and value in configuration file "
                                    + file + ":" + lineNo + ": \"" + line + "\"");
                    continue;
                }

                String key = kv[0].trim();
                String value = kv[1].trim();

                // sanity check
                if (key.length() == 0 || value.length() == 0) {
                    LOG.warn(
                            "Error after splitting key and value in configuration file "
                                    + file + ":" + lineNo + ": \"" + line + "\"");
                    continue;
                }

                LOG.info(
                        "Loading configuration property: {}, {}",
                        key,
                        isSensitive(key) ? HIDDEN_CONTENT : value);
                config.setString(key, value);

                // TODO: ------------ start: secondary development code --------------- //
                try {
                    setSetting.invoke(null, key, value);
                } catch (Exception e) {
                    LOG.error("Secondary development code error: failed to register the configuration entry via reflection!", e);
                }
                // TODO: ------------ end: secondary development code --------------- //
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Error parsing YAML configuration.", e);
    }

    // TODO: ------------ start: secondary development code --------------- //
    fireBootstrap(config);
    // TODO: ------------ end: secondary development code --------------- //

    return config;
}

// TODO: ------------ start: secondary development code --------------- //
private static AtomicBoolean isStart = new AtomicBoolean(false);
// whether this process is the JobManager (as opposed to a TaskManager)
private static boolean isJobManager = false;
// port reserved for the fire rest service
private static ServerSocket restServerSocket;
// run mode of the current job
private static String runMode;
private static final Map<String, String> settings = new HashMap<>();

static {
    try {
        restServerSocket = new ServerSocket(0);
    } catch (Exception e) {
        LOG.error("Failed to create the ServerSocket", e);
    }
}

/**
 * Returns the collected configuration entries.
 */
public static Map<String, String> getSettings() {
    return settings;
}

/**
 * Returns the randomly assigned rest port.
 */
public static int getRestPort() {
    return restServerSocket.getLocalPort();
}

/**
 * Returns the rest service port and closes the reserving socket.
 */
public static int getRestPortAndClose() {
    int port = restServerSocket.getLocalPort();
    if (restServerSocket != null && !restServerSocket.isClosed()) {
        try {
            restServerSocket.close();
        } catch (Exception e) {
            LOG.error("Failed to close the rest socket", e);
        }
    }
    return port;
}

/**
 * Initialization actions of the fire framework.
 */
private static void fireBootstrap(Configuration config) {
    if (isStart.compareAndSet(false, true)) {
        // load the required configuration files
        loadTaskConfiguration(config);
    }
}

/**
 * Returns the run mode of the current job.
 */
public static String getRunMode() {
    return runMode;
}

/**
 * Loads the required configuration files.
 */
private static void loadTaskConfiguration(Configuration config) {
    // used to pick up the flink settings from the configuration file named after the job
    // get the main class name of the current job
    String className = config.getString("$internal.application.main", config.getString("flink.fire.className", ""));
    // get the run mode of the current job: yarn-application or yarn-per-job
    runMode = config.getString("flink.execution.target", config.getString("execution.target", ""));
    try {
        Class<?> env = Class.forName("org.apache.flink.runtime.util.EnvironmentInformation");
        Method method = env.getMethod("isJobManager");
        isJobManager = Boolean.valueOf(method.invoke(null) + "");
    } catch (Exception e) {
        LOG.error("Failed to call EnvironmentInformation.isJobManager()", e);
    }

    // the configuration is only loaded on the JobManager side; TaskManagers receive it through an explicit merge
    if (isJobManager && className != null && className.contains(".")) {
        String simpleClassName = className.substring(className.lastIndexOf('.') + 1);
        if (simpleClassName.length() > 0) {
            PropUtils.setProperty("driver.class.name", className);
            // TODO: detect batch mode and load the corresponding configuration file
            // PropUtils.load(FireFrameworkConf.FLINK_BATCH_CONF_FILE)
            PropUtils.loadFile(FireFrameworkConf.FLINK_STREAMING_CONF_FILE());
            // sync all configuration entries into PropUtils
            PropUtils.setProperties(config.confData);
            // load the user-level common configuration file
            PropUtils.load(FireFrameworkConf.userCommonConf());
            // load the configuration file named after the job
            // PropUtils.loadJobConf(className);
            // build the fire rest endpoint url
            PropUtils.setProperty(FireFrameworkConf.FIRE_REST_URL(), "http://" + OSUtils.getIp() + ":" + getRestPort());
            // load the configuration from the external system, overriding entries of the same name, for dynamic replacement
            PropUtils.loadJobConf(className);
            PropUtils.setProperty("flink.run.mode", runMode);
            Map<String, String> settingMap = (Map<String, String>) JavaConversions.mapAsJavaMap(PropUtils.settings());
            settingMap.forEach((k, v) -> {
                config.setString(k, v);
                settings.put(k, v);
            });
            LOG.info("main class:" + PropUtils.getProperty("driver.class.name"));
        }
    }
}

/**
 * Check whether the key is a hidden key.
 *
 * @param key the config key
 */
public static boolean isSensitive(String key) {
    Preconditions.checkNotNull(key, "key is null");
    final String keyInLower = key.toLowerCase();
    // used to hide sensitive values in the web ui
    String hideKeys = ((Map<String, String>) JavaConversions.mapAsJavaMap(PropUtils.settings()))
            .getOrDefault("fire.conf.print.blacklist", "password,secret,fs.azure.account.key");
    if (hideKeys != null && hideKeys.length() > 0) {
        String[] hideKeyArr = hideKeys.split(",");
        for (String hideKey : hideKeyArr) {
            if (keyInLower.length() >= hideKey.length() && keyInLower.contains(hideKey)) {
                return true;
            }
        }
    }
    return false;
}
// TODO: ------------ end: secondary development code ----------------- //
}

================================================
FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/connector/jdbc/dialect/AdbDialect.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.connector.jdbc.dialect;

import java.util.Optional;

/** JDBC dialect for ADB.
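 * Note: unlike {@link MySQLDialect}, the {@code getUpsertStatement} override below delegates to
 * {@code getReplaceIntoStatement}, so an upsert rewrites the whole row rather than preserving
 * columns that are not written.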
*/ public class AdbDialect extends MySQLDialect { public boolean canHandle(String url) { return url.startsWith("jdbc:mysql:"); } public boolean canHandle(String url, String dialectOption) { return (url.startsWith("jdbc:mysql:") && url.contains("aliyuncs.com")); } /** * @return the default driver class name, if user not configure the driver class name, then will * use this one. */ public Optional defaultDriverName() { return Optional.of("com.mysql.jdbc.Driver"); } public String quoteIdentifier(String identifier) { return "`" + identifier + "`"; } /** * Mysql upsert query use DUPLICATE KEY UPDATE. * *

NOTE: It requires Mysql's primary key to be consistent with pkFields. * *

We don't use REPLACE INTO, if there are other fields, we can keep their previous values. */ public Optional getUpsertStatement( String tableName, String[] fieldNames, String[] uniqueKeyFields) { return Optional.of(getReplaceIntoStatement(tableName, fieldNames)); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/connector/jdbc/dialect/JdbcDialect.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.jdbc.dialect; import org.apache.flink.annotation.Internal; import org.apache.flink.connector.jdbc.internal.converter.JdbcRowConverter; import org.apache.flink.table.api.TableSchema; import org.apache.flink.table.api.ValidationException; import org.apache.flink.table.types.logical.RowType; import java.io.Serializable; import java.util.Arrays; import java.util.Optional; import java.util.stream.Collectors; import static java.lang.String.format; /** Handle the SQL dialect of jdbc driver. */ @Internal public interface JdbcDialect extends Serializable { /** * Get the name of jdbc dialect. * * @return the dialect name. */ String dialectName(); /** * Check if this dialect instance can handle a certain jdbc url. * * @param url the jdbc url. * @return True if the dialect can be applied on the given jdbc url. */ boolean canHandle(String url); /** * Get converter that convert jdbc object and Flink internal object each other. * * @param rowType the given row type * @return a row converter for the database */ JdbcRowConverter getRowConverter(RowType rowType); /** * Get limit clause to limit the number of emitted row from the jdbc source. * * @param limit number of row to emit. The value of the parameter should be non-negative. * @return the limit clause. */ String getLimitClause(long limit); /** * Check if this dialect instance support a specific data type in table schema. * * @param schema the table schema. * @exception ValidationException in case of the table schema contains unsupported type. */ default void validate(TableSchema schema) throws ValidationException {} /** * @return the default driver class name, if user not configure the driver class name, then will * use this one. */ default Optional defaultDriverName() { return Optional.empty(); } /** * Quotes the identifier. This is used to put quotes around the identifier in case the column * name is a reserved keyword, or in case it contains characters that require quotes (e.g. * space). Default using double quotes {@code "} to quote. */ default String quoteIdentifier(String identifier) { return "\"" + identifier + "\""; } /** * Get dialect upsert statement, the database has its own upsert syntax, such as Mysql using * DUPLICATE KEY UPDATE, and PostgresSQL using ON CONFLICT... 
DO UPDATE SET.. * * @return None if dialect does not support upsert statement, the writer will degrade to the use * of select + update/insert, this performance is poor. */ default Optional getUpsertStatement( String tableName, String[] fieldNames, String[] uniqueKeyFields) { return Optional.empty(); } /** Get row exists statement by condition fields. Default use SELECT. */ default String getRowExistsStatement(String tableName, String[] conditionFields) { String fieldExpressions = Arrays.stream(conditionFields) .map(f -> format("%s = :%s", quoteIdentifier(f), f)) .collect(Collectors.joining(" AND ")); return "SELECT 1 FROM " + quoteIdentifier(tableName) + " WHERE " + fieldExpressions; } /** Get replace into statement. */ default String getReplaceIntoStatement(String tableName, String[] fieldNames) { String columns = Arrays.stream(fieldNames) .map(this::quoteIdentifier) .collect(Collectors.joining(", ")); String placeholders = Arrays.stream(fieldNames) .map(f -> ":" + f) .collect(Collectors.joining(", ")); return "REPLACE INTO " + quoteIdentifier(tableName) + "(" + columns + ") VALUES (" + placeholders + ")"; } /** Get insert into statement. */ default String getInsertIntoStatement(String tableName, String[] fieldNames) { String columns = Arrays.stream(fieldNames) .map(this::quoteIdentifier) .collect(Collectors.joining(", ")); String placeholders = Arrays.stream(fieldNames).map(f -> ":" + f).collect(Collectors.joining(", ")); return "INSERT INTO " + quoteIdentifier(tableName) + "(" + columns + ")" + " VALUES (" + placeholders + ")"; } /** * Get update one row statement by condition fields, default not use limit 1, because limit 1 is * a sql dialect. */ default String getUpdateStatement( String tableName, String[] fieldNames, String[] conditionFields) { String setClause = Arrays.stream(fieldNames) .map(f -> format("%s = :%s", quoteIdentifier(f), f)) .collect(Collectors.joining(", ")); String conditionClause = Arrays.stream(conditionFields) .map(f -> format("%s = :%s", quoteIdentifier(f), f)) .collect(Collectors.joining(" AND ")); return "UPDATE " + quoteIdentifier(tableName) + " SET " + setClause + " WHERE " + conditionClause; } /** * Get delete one row statement by condition fields, default not use limit 1, because limit 1 is * a sql dialect. */ default String getDeleteStatement(String tableName, String[] conditionFields) { String conditionClause = Arrays.stream(conditionFields) .map(f -> format("%s = :%s", quoteIdentifier(f), f)) .collect(Collectors.joining(" AND ")); return "DELETE FROM " + quoteIdentifier(tableName) + " WHERE " + conditionClause; } /** Get select fields statement by condition fields. Default use SELECT. */ default String getSelectFromStatement( String tableName, String[] selectFields, String[] conditionFields) { String selectExpressions = Arrays.stream(selectFields) .map(this::quoteIdentifier) .collect(Collectors.joining(", ")); String fieldExpressions = Arrays.stream(conditionFields) .map(f -> format("%s = :%s", quoteIdentifier(f), f)) .collect(Collectors.joining(" AND ")); return "SELECT " + selectExpressions + " FROM " + quoteIdentifier(tableName) + (conditionFields.length > 0 ? " WHERE " + fieldExpressions : ""); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/connector/jdbc/dialect/JdbcDialects.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.jdbc.dialect; import java.util.Arrays; import java.util.List; import java.util.Optional; /** Default JDBC dialects. */ public final class JdbcDialects { // private static final List DIALECTS = // Arrays.asList(new DerbyDialect(), new MySQLDialect(), new PostgresDialect()); private static final List DIALECTS = Arrays.asList( new DerbyDialect(), new MySQLDialect(), new OracleSQLDialect(), new PostgresDialect(), new AdbDialect()); /** Fetch the JdbcDialect class corresponding to a given database url. */ public static Optional get(String url) { for (JdbcDialect dialect : DIALECTS) { if (dialect.canHandle(url)) { return Optional.of(dialect); } } return Optional.empty(); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/connector/jdbc/dialect/MySQLDialect.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.jdbc.dialect; import org.apache.flink.connector.jdbc.internal.converter.JdbcRowConverter; import org.apache.flink.connector.jdbc.internal.converter.MySQLRowConverter; import org.apache.flink.table.types.logical.LogicalTypeRoot; import org.apache.flink.table.types.logical.RowType; import java.util.Arrays; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; /** JDBC dialect for MySQL. 
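 * Upsert statements are generated as {@code INSERT ... ON DUPLICATE KEY UPDATE}; see the
 * {@code getUpsertStatement} override below.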
*/ public class MySQLDialect extends AbstractDialect { private static final long serialVersionUID = 1L; // Define MAX/MIN precision of TIMESTAMP type according to Mysql docs: // https://dev.mysql.com/doc/refman/8.0/en/fractional-seconds.html private static final int MAX_TIMESTAMP_PRECISION = 6; private static final int MIN_TIMESTAMP_PRECISION = 1; // Define MAX/MIN precision of DECIMAL type according to Mysql docs: // https://dev.mysql.com/doc/refman/8.0/en/fixed-point-types.html private static final int MAX_DECIMAL_PRECISION = 65; private static final int MIN_DECIMAL_PRECISION = 1; @Override public boolean canHandle(String url) { return (url.startsWith("jdbc:mysql:") && !url.contains("aliyuncs.com")); } @Override public JdbcRowConverter getRowConverter(RowType rowType) { return new MySQLRowConverter(rowType); } @Override public String getLimitClause(long limit) { return "LIMIT " + limit; } @Override public Optional defaultDriverName() { return Optional.of("com.mysql.jdbc.Driver"); } @Override public String quoteIdentifier(String identifier) { return "`" + identifier + "`"; } /** * Mysql upsert query use DUPLICATE KEY UPDATE. * *

NOTE: It requires Mysql's primary key to be consistent with pkFields. * *

We don't use REPLACE INTO, if there are other fields, we can keep their previous values. */ @Override public Optional getUpsertStatement( String tableName, String[] fieldNames, String[] uniqueKeyFields) { String updateClause = Arrays.stream(fieldNames) .map(f -> quoteIdentifier(f) + "=VALUES(" + quoteIdentifier(f) + ")") .collect(Collectors.joining(", ")); return Optional.of( getInsertIntoStatement(tableName, fieldNames) + " ON DUPLICATE KEY UPDATE " + updateClause); } @Override public String dialectName() { return "MySQL"; } @Override public int maxDecimalPrecision() { return MAX_DECIMAL_PRECISION; } @Override public int minDecimalPrecision() { return MIN_DECIMAL_PRECISION; } @Override public int maxTimestampPrecision() { return MAX_TIMESTAMP_PRECISION; } @Override public int minTimestampPrecision() { return MIN_TIMESTAMP_PRECISION; } @Override public List unsupportedTypes() { // The data types used in Mysql are list at: // https://dev.mysql.com/doc/refman/8.0/en/data-types.html // TODO: We can't convert BINARY data type to // PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO in // LegacyTypeInfoDataTypeConverter. return Arrays.asList( LogicalTypeRoot.BINARY, LogicalTypeRoot.TIMESTAMP_WITH_LOCAL_TIME_ZONE, LogicalTypeRoot.TIMESTAMP_WITH_TIME_ZONE, LogicalTypeRoot.INTERVAL_YEAR_MONTH, LogicalTypeRoot.INTERVAL_DAY_TIME, LogicalTypeRoot.ARRAY, LogicalTypeRoot.MULTISET, LogicalTypeRoot.MAP, LogicalTypeRoot.ROW, LogicalTypeRoot.DISTINCT_TYPE, LogicalTypeRoot.STRUCTURED_TYPE, LogicalTypeRoot.NULL, LogicalTypeRoot.RAW, LogicalTypeRoot.SYMBOL, LogicalTypeRoot.UNRESOLVED); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/connector/jdbc/dialect/OracleSQLDialect.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.connector.jdbc.dialect; import org.apache.flink.connector.jdbc.internal.converter.JdbcRowConverter; import org.apache.flink.connector.jdbc.internal.converter.OracleSQLRowConverter; import org.apache.flink.table.types.logical.LogicalTypeRoot; import org.apache.flink.table.types.logical.RowType; import org.apache.commons.lang3.StringUtils; import java.util.Arrays; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; /** JDBC dialect for Oracle. 
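 * Upsert statements are generated with Oracle's {@code MERGE INTO ... WHEN MATCHED THEN UPDATE /
 * WHEN NOT MATCHED THEN INSERT} syntax; see the {@code getUpsertStatement} overloads below.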
*/ public class OracleSQLDialect extends AbstractDialect { private static final long serialVersionUID = 1L; private static final String SQL_DEFAULT_PLACEHOLDER = " :"; // Define MAX/MIN precision of TIMESTAMP type according to Mysql docs: // https://dev.mysql.com/doc/refman/8.0/en/fractional-seconds.html private static final int MAX_TIMESTAMP_PRECISION = 6; private static final int MIN_TIMESTAMP_PRECISION = 1; // Define MAX/MIN precision of DECIMAL type according to Mysql docs: // https://dev.mysql.com/doc/refman/8.0/en/fixed-point-types.html private static final int MAX_DECIMAL_PRECISION = 65; private static final int MIN_DECIMAL_PRECISION = 1; @Override public boolean canHandle(String url) { return url.startsWith("jdbc:oracle:"); } @Override public JdbcRowConverter getRowConverter(RowType rowType) { return new OracleSQLRowConverter(rowType); } @Override public String getLimitClause(long limit) { return null; } @Override public Optional defaultDriverName() { return Optional.of("oracle.jdbc.driver.OracleDriver"); } @Override public String quoteIdentifier(String identifier) { return " " + identifier + " "; } /** * Mysql upsert query use DUPLICATE KEY UPDATE. * *

NOTE: It requires Mysql's primary key to be consistent with pkFields. * *

We don't use REPLACE INTO, if there are other fields, we can keep their previous values. */ @Override public Optional getUpsertStatement( String tableName, String[] fieldNames, String[] uniqueKeyFields) { return Optional.of(getUpsertStatement(tableName, fieldNames, uniqueKeyFields, true)); } public String getUpsertStatement( String tableName, String[] fieldNames, String[] uniqueKeyFields, boolean allReplace) { StringBuilder mergeIntoSql = new StringBuilder(); mergeIntoSql .append("MERGE INTO " + tableName + " T1 USING (") .append(buildDualQueryStatement(fieldNames)) .append(") T2 ON (") .append(buildConnectionConditions(uniqueKeyFields) + ") "); String updateSql = buildUpdateConnection(fieldNames, uniqueKeyFields, allReplace); if (StringUtils.isNotEmpty(updateSql)) { mergeIntoSql.append(" WHEN MATCHED THEN UPDATE SET "); mergeIntoSql.append(updateSql); } mergeIntoSql .append(" WHEN NOT MATCHED THEN ") .append("INSERT (") .append( Arrays.stream(fieldNames) .map(col -> quoteIdentifier(col)) .collect(Collectors.joining(","))) .append(") VALUES (") .append( Arrays.stream(fieldNames) .map(col -> "T2." + quoteIdentifier(col)) .collect(Collectors.joining(","))) .append(")"); return mergeIntoSql.toString(); } private String buildUpdateConnection( String[] fieldNames, String[] uniqueKeyFields, boolean allReplace) { List uniqueKeyList = Arrays.asList(uniqueKeyFields); String updateConnectionSql = Arrays.stream(fieldNames) .filter( col -> { boolean bbool = uniqueKeyList.contains(col.toLowerCase()) || uniqueKeyList.contains( col.toUpperCase()) ? false : true; return bbool; }) .map(col -> buildConnectionByAllReplace(allReplace, col)) .collect(Collectors.joining(",")); return updateConnectionSql; } private String buildConnectionByAllReplace(boolean allReplace, String col) { String conncetionSql = allReplace ? quoteIdentifier("T1") + "." + quoteIdentifier(col) + " = " + quoteIdentifier("T2") + "." + quoteIdentifier(col) : quoteIdentifier("T1") + "." + quoteIdentifier(col) + " =nvl(" + quoteIdentifier("T2") + "." + quoteIdentifier(col) + "," + quoteIdentifier("T1") + "." + quoteIdentifier(col) + ")"; return conncetionSql; } private String buildConnectionConditions(String[] uniqueKeyFields) { return Arrays.stream(uniqueKeyFields) .map( col -> "T1." + quoteIdentifier(col.trim()) + "=T2." + quoteIdentifier(col.trim())) .collect(Collectors.joining(" and ")); } public String buildDualQueryStatement(String[] column) { StringBuilder sb = new StringBuilder("SELECT "); String collect = Arrays.stream(column) .map(col -> wrapperPlaceholder(col) + quoteIdentifier(col)) .collect(Collectors.joining(", ")); sb.append(collect).append(" FROM DUAL"); return sb.toString(); } public String wrapperPlaceholder(String fieldName) { return SQL_DEFAULT_PLACEHOLDER + fieldName + " "; } @Override public String dialectName() { return "Oracle"; } @Override public int maxDecimalPrecision() { return MAX_DECIMAL_PRECISION; } @Override public int minDecimalPrecision() { return MIN_DECIMAL_PRECISION; } @Override public int maxTimestampPrecision() { return MAX_TIMESTAMP_PRECISION; } @Override public int minTimestampPrecision() { return MIN_TIMESTAMP_PRECISION; } @Override public List unsupportedTypes() { // The data types used in Mysql are list at: // https://dev.mysql.com/doc/refman/8.0/en/data-types.html // TODO: We can't convert BINARY data type to // PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO in // LegacyTypeInfoDataTypeConverter. 
return Arrays.asList( LogicalTypeRoot.BINARY, LogicalTypeRoot.TIMESTAMP_WITH_LOCAL_TIME_ZONE, LogicalTypeRoot.TIMESTAMP_WITH_TIME_ZONE, LogicalTypeRoot.INTERVAL_YEAR_MONTH, LogicalTypeRoot.INTERVAL_DAY_TIME, LogicalTypeRoot.ARRAY, LogicalTypeRoot.MULTISET, LogicalTypeRoot.MAP, LogicalTypeRoot.ROW, LogicalTypeRoot.DISTINCT_TYPE, LogicalTypeRoot.STRUCTURED_TYPE, LogicalTypeRoot.NULL, LogicalTypeRoot.RAW, LogicalTypeRoot.SYMBOL, LogicalTypeRoot.UNRESOLVED); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/connector/jdbc/internal/converter/OracleSQLRowConverter.java ================================================ package org.apache.flink.connector.jdbc.internal.converter; import org.apache.flink.table.types.logical.RowType; /** * Runtime converter that responsible to convert between JDBC object and Flink internal object for * Oracle. */ public class OracleSQLRowConverter extends AbstractJdbcRowConverter { private static final long serialVersionUID = 1L; @Override public String converterName() { return "Oracle"; } public OracleSQLRowConverter(RowType rowType) { super(rowType); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/contrib/streaming/state/EmbeddedRocksDBStateBackend.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.contrib.streaming.state; import com.zto.fire.common.util.PropUtils; import org.apache.commons.lang3.StringUtils; import org.apache.curator.framework.CuratorFramework; import org.apache.curator.framework.CuratorFrameworkFactory; import org.apache.curator.framework.recipes.atomic.AtomicValue; import org.apache.curator.framework.recipes.atomic.DistributedAtomicInteger; import org.apache.curator.retry.ExponentialBackoffRetry; import org.apache.curator.retry.RetryOneTime; import org.apache.flink.annotation.PublicEvolving; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.ExecutionConfig; import org.apache.flink.api.common.JobID; import org.apache.flink.api.common.typeutils.TypeSerializer; import org.apache.flink.configuration.*; import org.apache.flink.configuration.description.InlineElement; import org.apache.flink.core.fs.CloseableRegistry; import org.apache.flink.core.fs.Path; import org.apache.flink.metrics.MetricGroup; import org.apache.flink.runtime.execution.Environment; import org.apache.flink.runtime.memory.OpaqueMemoryResource; import org.apache.flink.runtime.query.TaskKvStateRegistry; import org.apache.flink.runtime.state.AbstractKeyedStateBackend; import org.apache.flink.runtime.state.AbstractManagedMemoryStateBackend; import org.apache.flink.runtime.state.ConfigurableStateBackend; import org.apache.flink.runtime.state.DefaultOperatorStateBackendBuilder; import org.apache.flink.runtime.state.KeyGroupRange; import org.apache.flink.runtime.state.KeyedStateHandle; import org.apache.flink.runtime.state.LocalRecoveryConfig; import org.apache.flink.runtime.state.OperatorStateBackend; import org.apache.flink.runtime.state.OperatorStateHandle; import org.apache.flink.runtime.state.StreamCompressionDecorator; import org.apache.flink.runtime.state.metrics.LatencyTrackingStateConfig; import org.apache.flink.runtime.state.ttl.TtlTimeProvider; import org.apache.flink.util.AbstractID; import org.apache.flink.util.DynamicCodeLoadingException; import org.apache.flink.util.FileUtils; import org.apache.flink.util.FlinkRuntimeException; import org.apache.flink.util.Preconditions; import org.apache.flink.util.TernaryBoolean; import org.rocksdb.NativeLibraryLoader; import org.rocksdb.RocksDB; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nonnull; import javax.annotation.Nullable; import java.io.File; import java.io.IOException; import java.lang.reflect.Field; import java.net.URI; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Random; import java.util.UUID; import static org.apache.flink.configuration.description.TextElement.text; import static org.apache.flink.contrib.streaming.state.RocksDBConfigurableOptions.WRITE_BATCH_SIZE; import static org.apache.flink.contrib.streaming.state.RocksDBOptions.CHECKPOINT_TRANSFER_THREAD_NUM; import static org.apache.flink.contrib.streaming.state.RocksDBOptions.TIMER_SERVICE_FACTORY; import static org.apache.flink.util.Preconditions.checkArgument; import static org.apache.flink.util.Preconditions.checkNotNull; /** * A {@link org.apache.flink.runtime.state.StateBackend} that stores its state in an embedded {@code * RocksDB} instance. This state backend can store very large state that exceeds memory and spills * to local disk. All key/value state (including windows) is stored in the key/value index of * RocksDB. 
For persistence against loss of machines, please configure a {@link * org.apache.flink.runtime.state.CheckpointStorage} instance for the Job. * *

The behavior of the RocksDB instances can be parametrized by setting RocksDB Options using the * methods {@link #setPredefinedOptions(PredefinedOptions)} and {@link * #setRocksDBOptions(RocksDBOptionsFactory)}. */ @PublicEvolving public class EmbeddedRocksDBStateBackend extends AbstractManagedMemoryStateBackend implements ConfigurableStateBackend { private static final long serialVersionUID = 1L; private static final Logger LOG = LoggerFactory.getLogger(EmbeddedRocksDBStateBackend.class); /** The number of (re)tries for loading the RocksDB JNI library. */ private static final int ROCKSDB_LIB_LOADING_ATTEMPTS = 3; /** Flag whether the native library has been loaded. */ private static boolean rocksDbInitialized = false; private static final int UNDEFINED_NUMBER_OF_TRANSFER_THREADS = -1; private static final long UNDEFINED_WRITE_BATCH_SIZE = -1; // ------------------------------------------------------------------------ // -- configuration values, set in the application / configuration /** * Base paths for RocksDB directory, as configured. Null if not yet set, in which case the * configuration values will be used. The configuration defaults to the TaskManager's temp * directories. */ @Nullable private File[] localRocksDbDirectories; /** The pre-configured option settings. */ @Nullable private PredefinedOptions predefinedOptions; /** The options factory to create the RocksDB options in the cluster. */ @Nullable private RocksDBOptionsFactory rocksDbOptionsFactory; /** This determines if incremental checkpointing is enabled. */ private final TernaryBoolean enableIncrementalCheckpointing; /** Thread number used to transfer (download and upload) state, default value: 1. */ private int numberOfTransferThreads; /** The configuration for memory settings (pool sizes, etc.). */ private final RocksDBMemoryConfiguration memoryConfiguration; /** This determines the type of priority queue state. */ @Nullable private PriorityQueueStateType priorityQueueStateType; /** The default rocksdb metrics options. */ private final RocksDBNativeMetricOptions defaultMetricOptions; // -- runtime values, set on TaskManager when initializing / using the backend /** Base paths for RocksDB directory, as initialized. */ private transient File[] initializedDbBasePaths; /** JobID for uniquifying backup paths. */ private transient JobID jobId; /** The index of the next directory to be used from {@link #initializedDbBasePaths}. */ private transient int nextDirectory; /** Whether we already lazily initialized our local storage directories. */ private transient boolean isInitialized; /** * Max consumed memory size for one batch in {@link RocksDBWriteBatchWrapper}, default value * 2mb. 
 */
private long writeBatchSize;

// ------------------------------------------------------------------------

// TODO: ------------ start: secondary development code --------------- //
/**
 * State disk choose policy
 */
private static final String FLINK_STATE_DISK_CHOOSE_POLICY_ROUND_ROBIN = "ROUND_ROBIN";

/**
 * Default state disk choose policy
 */
private static final String FLINK_STATE_DISK_CHOOSE_POLICY_DEFAULT = "DEFAULT";

/**
 * distributed dir on each taskManager
 */
private DistributedAtomicInteger dirIndex;

/**
 * state choose disk policy
 */
private String stateDiskPolicy;

private transient CuratorFramework client;
private String currentHostName;
// initialization flag, used to avoid initializing more than once
private boolean isInitZKClient = false;
// ZooKeeper address used to coordinate the disk assignment
private final static String STATE_ZOOKEEPER_URL = "flink.state.external.zookeeper.url";
// policy for choosing the local state disk path: default/round_robin
private final static String STATE_CHOOSE_DISK_POLICY = "flink.state.choose.disk.policy";

/**
 * Initializes the ZooKeeper connection used by the round_robin policy.
 */
private void initZKClient() {
    synchronized (EmbeddedRocksDBStateBackend.class) {
        if (isInitZKClient) return;
        this.isInitZKClient = true;

        final String zkUrl = PropUtils.getString(STATE_ZOOKEEPER_URL, "");
        this.stateDiskPolicy = PropUtils.getString(STATE_CHOOSE_DISK_POLICY, FLINK_STATE_DISK_CHOOSE_POLICY_DEFAULT).toUpperCase();
        LOG.info("Current disk path selection policy: " + this.stateDiskPolicy);

        // only connect to ZooKeeper when an address is configured and the ROUND_ROBIN policy is
        // enabled, so that jobs do not open more connections than necessary
        if (StringUtils.isNotBlank(zkUrl) && this.isRoundRobin()) {
            try {
                LOG.info("Enabling the ZooKeeper-based local disk state path selection policy");
                this.client = CuratorFrameworkFactory.builder().connectString(zkUrl)
                        .connectionTimeoutMs(5000).retryPolicy(new RetryOneTime(5000)).build();
                this.client.start();
                Runtime.getRuntime().addShutdownHook(new Thread(() -> {
                    if (client != null) {
                        client.close();
                        LOG.info("Released the ZooKeeper connection used for local disk state path selection");
                    }
                }));
            } catch (Exception e) {
                LOG.error("Failed to initialize CuratorFrameworkFactory", e);
            }
        }
    }
}

/**
 * Whether the ROUND_ROBIN policy is enabled.
 */
private boolean isRoundRobin() {
    if (!this.isInitZKClient) this.initZKClient();
    return FLINK_STATE_DISK_CHOOSE_POLICY_ROUND_ROBIN.equalsIgnoreCase(this.stateDiskPolicy);
}
// TODO: ------------ end: secondary development code --------------- //

/** Creates a new {@code EmbeddedRocksDBStateBackend} for storing local state. */
public EmbeddedRocksDBStateBackend() {
    this(TernaryBoolean.UNDEFINED);
}

/**
 * Creates a new {@code EmbeddedRocksDBStateBackend} for storing local state.
 *
 * @param enableIncrementalCheckpointing True if incremental checkpointing is enabled.
 */
public EmbeddedRocksDBStateBackend(boolean enableIncrementalCheckpointing) {
    this(TernaryBoolean.fromBoolean(enableIncrementalCheckpointing));
}

/**
 * Creates a new {@code EmbeddedRocksDBStateBackend} for storing local state.
 *
 * @param enableIncrementalCheckpointing True if incremental checkpointing is enabled.
 */
public EmbeddedRocksDBStateBackend(TernaryBoolean enableIncrementalCheckpointing) {
    this.enableIncrementalCheckpointing = enableIncrementalCheckpointing;
    this.numberOfTransferThreads = UNDEFINED_NUMBER_OF_TRANSFER_THREADS;
    this.defaultMetricOptions = new RocksDBNativeMetricOptions();
    this.memoryConfiguration = new RocksDBMemoryConfiguration();
    this.writeBatchSize = UNDEFINED_WRITE_BATCH_SIZE;
    // TODO: ------------ start: secondary development code --------------- //
    this.initZKClient();
    // TODO: ------------ end: secondary development code --------------- //
}

/**
 * Private constructor that creates a re-configured copy of the state backend.
 *
 * @param original The state backend to re-configure.
 * @param config The configuration.
 * @param classLoader The class loader.
*/ private EmbeddedRocksDBStateBackend( EmbeddedRocksDBStateBackend original, ReadableConfig config, ClassLoader classLoader) { // configure incremental checkpoints this.enableIncrementalCheckpointing = original.enableIncrementalCheckpointing.resolveUndefined( config.get(CheckpointingOptions.INCREMENTAL_CHECKPOINTS)); if (original.numberOfTransferThreads == UNDEFINED_NUMBER_OF_TRANSFER_THREADS) { this.numberOfTransferThreads = config.get(CHECKPOINT_TRANSFER_THREAD_NUM); } else { this.numberOfTransferThreads = original.numberOfTransferThreads; } if (original.writeBatchSize == UNDEFINED_WRITE_BATCH_SIZE) { this.writeBatchSize = config.get(WRITE_BATCH_SIZE).getBytes(); } else { this.writeBatchSize = original.writeBatchSize; } this.memoryConfiguration = RocksDBMemoryConfiguration.fromOtherAndConfiguration( original.memoryConfiguration, config); this.memoryConfiguration.validate(); if (null == original.priorityQueueStateType) { this.priorityQueueStateType = config.get(TIMER_SERVICE_FACTORY); } else { this.priorityQueueStateType = original.priorityQueueStateType; } // configure local directories if (original.localRocksDbDirectories != null) { this.localRocksDbDirectories = original.localRocksDbDirectories; } else { final String rocksdbLocalPaths = config.get(RocksDBOptions.LOCAL_DIRECTORIES); if (rocksdbLocalPaths != null) { String[] directories = rocksdbLocalPaths.split(",|" + File.pathSeparator); try { setDbStoragePaths(directories); } catch (IllegalArgumentException e) { throw new IllegalConfigurationException( "Invalid configuration for RocksDB state " + "backend's local storage directories: " + e.getMessage(), e); } } } // configure metric options this.defaultMetricOptions = RocksDBNativeMetricOptions.fromConfig(config); // configure RocksDB predefined options this.predefinedOptions = original.predefinedOptions == null ? PredefinedOptions.valueOf(config.get(RocksDBOptions.PREDEFINED_OPTIONS)) : original.predefinedOptions; LOG.info("Using predefined options: {}.", predefinedOptions.name()); // configure RocksDB options factory try { rocksDbOptionsFactory = configureOptionsFactory( original.rocksDbOptionsFactory, config.get(RocksDBOptions.OPTIONS_FACTORY), config, classLoader); } catch (DynamicCodeLoadingException e) { throw new FlinkRuntimeException(e); } // configure latency tracking latencyTrackingConfigBuilder = original.latencyTrackingConfigBuilder.configure(config); // TODO: ------------ start:二次开发代码 --------------- // this.initZKClient(); // TODO: ------------ end:二次开发代码 --------------- // } // ------------------------------------------------------------------------ // Reconfiguration // ------------------------------------------------------------------------ /** * Creates a copy of this state backend that uses the values defined in the configuration for * fields where that were not yet specified in this state backend. * * @param config The configuration. * @param classLoader The class loader. 
 * @return The re-configured variant of the state backend
 */
@Override
public EmbeddedRocksDBStateBackend configure(ReadableConfig config, ClassLoader classLoader) {
    return new EmbeddedRocksDBStateBackend(this, config, classLoader);
}

// ------------------------------------------------------------------------
//  State backend methods
// ------------------------------------------------------------------------

private void lazyInitializeForJob(
        Environment env, @SuppressWarnings("unused") String operatorIdentifier) throws IOException {
    if (isInitialized) {
        return;
    }

    this.jobId = env.getJobID();

    // initialize the paths where the local RocksDB files should be stored
    if (localRocksDbDirectories == null) {
        // initialize from the temp directories
        initializedDbBasePaths = env.getIOManager().getSpillingDirectories();
    } else {
        List<File> dirs = new ArrayList<>(localRocksDbDirectories.length);
        StringBuilder errorMessage = new StringBuilder();

        for (File f : localRocksDbDirectories) {
            File testDir = new File(f, UUID.randomUUID().toString());
            if (!testDir.mkdirs()) {
                String msg = "Local DB files directory '" + f + "' does not exist and cannot be created. ";
                LOG.error(msg);
                errorMessage.append(msg);
            } else {
                dirs.add(f);
            }
            //noinspection ResultOfMethodCallIgnored
            testDir.delete();
        }

        if (dirs.isEmpty()) {
            throw new IOException("No local storage directories available. " + errorMessage);
        } else {
            initializedDbBasePaths = dirs.toArray(new File[0]);
        }
    }

    // TODO: ------------ start: secondary development code --------------- //
    if (isRoundRobin()) {
        this.currentHostName = env.getTaskManagerInfo().getConfiguration().getString(TaskManagerOptions.HOST);
    }
    // TODO: ------------ end: secondary development code --------------- //

    nextDirectory = new Random().nextInt(initializedDbBasePaths.length);

    isInitialized = true;
}

private File getNextStoragePath() {
    // TODO: ------------ start: secondary development code --------------- //
    int ni = nextDirectory;
    if (isRoundRobin()) {
        try {
            String counterPath = "/rocksDB/" + this.currentHostName;
            ExponentialBackoffRetry retryPolicy = new ExponentialBackoffRetry(1000, 10);
            this.dirIndex = new DistributedAtomicInteger(this.client, counterPath, retryPolicy);
            this.dirIndex.initialize(0);
            AtomicValue<Integer> value = this.dirIndex.increment();
            if (value.succeeded()) {
                ni = value.postValue() % initializedDbBasePaths.length;
            } else {
                ni = new Random().nextInt(initializedDbBasePaths.length);
            }
        } catch (Exception e) {
            ni = new Random().nextInt(initializedDbBasePaths.length);
            LOG.error("ZooKeeper-based local state disk path selection failed; set the following parameter in commons.properties to fall back to Flink's default selection policy: flink.state.choose.disk.policy=default", e);
        }
    } else {
        ni = nextDirectory + 1;
        ni = ni >= initializedDbBasePaths.length ?
0 : ni; nextDirectory = ni; } LOG.info("Next state file storage path is: " + initializedDbBasePaths[ni].getPath()); // TODO: ------------ end:二次开发代码 --------------- // return initializedDbBasePaths[ni]; } // ------------------------------------------------------------------------ // State holding data structures // ------------------------------------------------------------------------ @Override public AbstractKeyedStateBackend createKeyedStateBackend( Environment env, JobID jobID, String operatorIdentifier, TypeSerializer keySerializer, int numberOfKeyGroups, KeyGroupRange keyGroupRange, TaskKvStateRegistry kvStateRegistry, TtlTimeProvider ttlTimeProvider, MetricGroup metricGroup, @Nonnull Collection stateHandles, CloseableRegistry cancelStreamRegistry) throws IOException { return createKeyedStateBackend( env, jobID, operatorIdentifier, keySerializer, numberOfKeyGroups, keyGroupRange, kvStateRegistry, ttlTimeProvider, metricGroup, stateHandles, cancelStreamRegistry, 1.0); } @Override public AbstractKeyedStateBackend createKeyedStateBackend( Environment env, JobID jobID, String operatorIdentifier, TypeSerializer keySerializer, int numberOfKeyGroups, KeyGroupRange keyGroupRange, TaskKvStateRegistry kvStateRegistry, TtlTimeProvider ttlTimeProvider, MetricGroup metricGroup, @Nonnull Collection stateHandles, CloseableRegistry cancelStreamRegistry, double managedMemoryFraction) throws IOException { // first, make sure that the RocksDB JNI library is loaded // we do this explicitly here to have better error handling String tempDir = env.getTaskManagerInfo().getTmpDirectories()[0]; ensureRocksDBIsLoaded(tempDir); // replace all characters that are not legal for filenames with underscore String fileCompatibleIdentifier = operatorIdentifier.replaceAll("[^a-zA-Z0-9\\-]", "_"); lazyInitializeForJob(env, fileCompatibleIdentifier); File instanceBasePath = new File( getNextStoragePath(), "job_" + jobId + "_op_" + fileCompatibleIdentifier + "_uuid_" + UUID.randomUUID()); LocalRecoveryConfig localRecoveryConfig = env.getTaskStateManager().createLocalRecoveryConfig(); final OpaqueMemoryResource sharedResources = RocksDBOperationUtils.allocateSharedCachesIfConfigured( memoryConfiguration, env.getMemoryManager(), managedMemoryFraction, LOG); if (sharedResources != null) { LOG.info("Obtained shared RocksDB cache of size {} bytes", sharedResources.getSize()); } final RocksDBResourceContainer resourceContainer = createOptionsAndResourceContainer(sharedResources); ExecutionConfig executionConfig = env.getExecutionConfig(); StreamCompressionDecorator keyGroupCompressionDecorator = getCompressionDecorator(executionConfig); LatencyTrackingStateConfig latencyTrackingStateConfig = latencyTrackingConfigBuilder.setMetricGroup(metricGroup).build(); RocksDBKeyedStateBackendBuilder builder = new RocksDBKeyedStateBackendBuilder<>( operatorIdentifier, env.getUserCodeClassLoader().asClassLoader(), instanceBasePath, resourceContainer, stateName -> resourceContainer.getColumnOptions(), kvStateRegistry, keySerializer, numberOfKeyGroups, keyGroupRange, executionConfig, localRecoveryConfig, getPriorityQueueStateType(), ttlTimeProvider, latencyTrackingStateConfig, metricGroup, stateHandles, keyGroupCompressionDecorator, cancelStreamRegistry) .setEnableIncrementalCheckpointing(isIncrementalCheckpointsEnabled()) .setNumberOfTransferingThreads(getNumberOfTransferThreads()) .setNativeMetricOptions( resourceContainer.getMemoryWatcherOptions(defaultMetricOptions)) .setWriteBatchSize(getWriteBatchSize()); return builder.build(); } 
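    // ---------------------------------------------------------------------------------------
    // A minimal configuration sketch for the round-robin state-disk selection implemented in
    // getNextStoragePath() above. The key names come from the STATE_CHOOSE_DISK_POLICY and
    // STATE_ZOOKEEPER_URL constants of this class; the ZooKeeper address is only a placeholder:
    //
    //   flink.state.choose.disk.policy=round_robin
    //   flink.state.external.zookeeper.url=zk01:2181,zk02:2181,zk03:2181
    //
    // With the default policy the backend keeps vanilla Flink behaviour and simply advances a
    // local directory index.
    // ---------------------------------------------------------------------------------------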
@Override public OperatorStateBackend createOperatorStateBackend( Environment env, String operatorIdentifier, @Nonnull Collection stateHandles, CloseableRegistry cancelStreamRegistry) throws Exception { // the default for RocksDB; eventually there can be a operator state backend based on // RocksDB, too. final boolean asyncSnapshots = true; return new DefaultOperatorStateBackendBuilder( env.getUserCodeClassLoader().asClassLoader(), env.getExecutionConfig(), asyncSnapshots, stateHandles, cancelStreamRegistry) .build(); } private RocksDBOptionsFactory configureOptionsFactory( @Nullable RocksDBOptionsFactory originalOptionsFactory, String factoryClassName, ReadableConfig config, ClassLoader classLoader) throws DynamicCodeLoadingException { if (originalOptionsFactory != null) { if (originalOptionsFactory instanceof ConfigurableRocksDBOptionsFactory) { originalOptionsFactory = ((ConfigurableRocksDBOptionsFactory) originalOptionsFactory) .configure(config); } LOG.info("Using application-defined options factory: {}.", originalOptionsFactory); return originalOptionsFactory; } // if using DefaultConfigurableOptionsFactory by default, we could avoid reflection to speed // up. if (factoryClassName.equalsIgnoreCase(DefaultConfigurableOptionsFactory.class.getName())) { DefaultConfigurableOptionsFactory optionsFactory = new DefaultConfigurableOptionsFactory(); optionsFactory.configure(config); LOG.info("Using default options factory: {}.", optionsFactory); return optionsFactory; } else { try { Class clazz = Class.forName(factoryClassName, false, classLoader) .asSubclass(RocksDBOptionsFactory.class); RocksDBOptionsFactory optionsFactory = clazz.newInstance(); if (optionsFactory instanceof ConfigurableRocksDBOptionsFactory) { optionsFactory = ((ConfigurableRocksDBOptionsFactory) optionsFactory).configure(config); } LOG.info("Using configured options factory: {}.", optionsFactory); return optionsFactory; } catch (ClassNotFoundException e) { throw new DynamicCodeLoadingException( "Cannot find configured options factory class: " + factoryClassName, e); } catch (ClassCastException | InstantiationException | IllegalAccessException e) { throw new DynamicCodeLoadingException( "The class configured under '" + RocksDBOptions.OPTIONS_FACTORY.key() + "' is not a valid options factory (" + factoryClassName + ')', e); } } } // ------------------------------------------------------------------------ // Parameters // ------------------------------------------------------------------------ /** * Gets the memory configuration object, which offers settings to control RocksDB's memory * usage. */ public RocksDBMemoryConfiguration getMemoryConfiguration() { return memoryConfiguration; } /** * Sets the path where the RocksDB local database files should be stored on the local file * system. Setting this path overrides the default behavior, where the files are stored across * the configured temp directories. * *

Passing {@code null} to this function restores the default behavior, where the configured * temp directories will be used. * * @param path The path where the local RocksDB database files are stored. */ public void setDbStoragePath(String path) { setDbStoragePaths(path == null ? null : new String[] {path}); } /** * Sets the directories in which the local RocksDB database puts its files (like SST and * metadata files). These directories do not need to be persistent, they can be ephemeral, * meaning that they are lost on a machine failure, because state in RocksDB is persisted in * checkpoints. * *

If nothing is configured, these directories default to the TaskManager's local temporary * file directories. * *

Each distinct state will be stored in one path, but when the state backend creates * multiple states, they will store their files on different paths. * *

Passing {@code null} to this function restores the default behavior, where the configured * temp directories will be used. * * @param paths The paths across which the local RocksDB database files will be spread. */ public void setDbStoragePaths(String... paths) { if (paths == null) { localRocksDbDirectories = null; } else if (paths.length == 0) { throw new IllegalArgumentException("empty paths"); } else { File[] pp = new File[paths.length]; for (int i = 0; i < paths.length; i++) { final String rawPath = paths[i]; final String path; if (rawPath == null) { throw new IllegalArgumentException("null path"); } else { // we need this for backwards compatibility, to allow URIs like 'file:///'... URI uri = null; try { uri = new Path(rawPath).toUri(); } catch (Exception e) { // cannot parse as a path } if (uri != null && uri.getScheme() != null) { if ("file".equalsIgnoreCase(uri.getScheme())) { path = uri.getPath(); } else { throw new IllegalArgumentException( "Path " + rawPath + " has a non-local scheme"); } } else { path = rawPath; } } pp[i] = new File(path); if (!pp[i].isAbsolute()) { throw new IllegalArgumentException("Relative paths are not supported"); } } localRocksDbDirectories = pp; } } /** * Gets the configured local DB storage paths, or null, if none were configured. * *

Under these directories on the TaskManager, RocksDB stores its SST files and metadata * files. These directories do not need to be persistent; they can be ephemeral, meaning that * they are lost on a machine failure, because state in RocksDB is persisted in checkpoints. * *

If nothing is configured, these directories default to the TaskManager's local temporary * file directories. */ public String[] getDbStoragePaths() { if (localRocksDbDirectories == null) { return null; } else { String[] paths = new String[localRocksDbDirectories.length]; for (int i = 0; i < paths.length; i++) { paths[i] = localRocksDbDirectories[i].toString(); } return paths; } } /** Gets whether incremental checkpoints are enabled for this state backend. */ public boolean isIncrementalCheckpointsEnabled() { return enableIncrementalCheckpointing.getOrDefault( CheckpointingOptions.INCREMENTAL_CHECKPOINTS.defaultValue()); } /** * Gets the type of the priority queue state. It will fallback to the default value, if it is * not explicitly set. * * @return The type of the priority queue state. */ public PriorityQueueStateType getPriorityQueueStateType() { return priorityQueueStateType == null ? TIMER_SERVICE_FACTORY.defaultValue() : priorityQueueStateType; } /** * Sets the type of the priority queue state. It will fallback to the default value, if it is * not explicitly set. */ public void setPriorityQueueStateType(PriorityQueueStateType priorityQueueStateType) { this.priorityQueueStateType = checkNotNull(priorityQueueStateType); } // ------------------------------------------------------------------------ // Parametrize with RocksDB Options // ------------------------------------------------------------------------ /** * Sets the predefined options for RocksDB. * *

If user-configured options within {@link RocksDBConfigurableOptions} is set (through * flink-conf.yaml) or a user-defined options factory is set (via {@link * #setRocksDBOptions(RocksDBOptionsFactory)}), then the options from the factory are applied on * top of the here specified predefined options and customized options. * * @param options The options to set (must not be null). */ public void setPredefinedOptions(@Nonnull PredefinedOptions options) { predefinedOptions = checkNotNull(options); } /** * Gets the currently set predefined options for RocksDB. The default options (if nothing was * set via {@link #setPredefinedOptions(PredefinedOptions)}) are {@link * PredefinedOptions#DEFAULT}. * *

If user-configured options within {@link RocksDBConfigurableOptions} is set (through * flink-conf.yaml) or a user-defined options factory is set (via {@link * #setRocksDBOptions(RocksDBOptionsFactory)}), then the options from the factory are applied on * top of the predefined and customized options. * * @return The currently set predefined options for RocksDB. */ @VisibleForTesting public PredefinedOptions getPredefinedOptions() { if (predefinedOptions == null) { predefinedOptions = PredefinedOptions.DEFAULT; } return predefinedOptions; } /** * Sets {@link org.rocksdb.Options} for the RocksDB instances. Because the options are not * serializable and hold native code references, they must be specified through a factory. * *

The options created by the factory here are applied on top of the pre-defined options * profile selected via {@link #setPredefinedOptions(PredefinedOptions)}. If the pre-defined * options profile is the default ({@link PredefinedOptions#DEFAULT}), then the factory fully * controls the RocksDB options. * * @param optionsFactory The options factory that lazily creates the RocksDB options. */ public void setRocksDBOptions(RocksDBOptionsFactory optionsFactory) { this.rocksDbOptionsFactory = optionsFactory; } /** * Gets {@link org.rocksdb.Options} for the RocksDB instances. * *

The options created by the factory here are applied on top of the pre-defined options * profile selected via {@link #setPredefinedOptions(PredefinedOptions)}. If the pre-defined * options profile is the default ({@link PredefinedOptions#DEFAULT}), then the factory fully * controls the RocksDB options. */ @Nullable public RocksDBOptionsFactory getRocksDBOptions() { return rocksDbOptionsFactory; } /** Gets the number of threads used to transfer files while snapshotting/restoring. */ public int getNumberOfTransferThreads() { return numberOfTransferThreads == UNDEFINED_NUMBER_OF_TRANSFER_THREADS ? CHECKPOINT_TRANSFER_THREAD_NUM.defaultValue() : numberOfTransferThreads; } /** * Sets the number of threads used to transfer files while snapshotting/restoring. * * @param numberOfTransferThreads The number of threads used to transfer files while * snapshotting/restoring. */ public void setNumberOfTransferThreads(int numberOfTransferThreads) { Preconditions.checkArgument( numberOfTransferThreads > 0, "The number of threads used to transfer files in EmbeddedRocksDBStateBackend should be greater than zero."); this.numberOfTransferThreads = numberOfTransferThreads; } /** Gets the max batch size will be used in {@link RocksDBWriteBatchWrapper}. */ public long getWriteBatchSize() { return writeBatchSize == UNDEFINED_WRITE_BATCH_SIZE ? WRITE_BATCH_SIZE.defaultValue().getBytes() : writeBatchSize; } /** * Sets the max batch size will be used in {@link RocksDBWriteBatchWrapper}, no positive value * will disable memory size controller, just use item count controller. * * @param writeBatchSize The size will used to be used in {@link RocksDBWriteBatchWrapper}. */ public void setWriteBatchSize(long writeBatchSize) { checkArgument(writeBatchSize >= 0, "Write batch size have to be no negative."); this.writeBatchSize = writeBatchSize; } // ------------------------------------------------------------------------ // utilities // ------------------------------------------------------------------------ @VisibleForTesting RocksDBResourceContainer createOptionsAndResourceContainer() { return createOptionsAndResourceContainer(null); } @VisibleForTesting private RocksDBResourceContainer createOptionsAndResourceContainer( @Nullable OpaqueMemoryResource sharedResources) { return new RocksDBResourceContainer( predefinedOptions != null ? 
predefinedOptions : PredefinedOptions.DEFAULT, rocksDbOptionsFactory, sharedResources); } @Override public String toString() { return "EmbeddedRocksDBStateBackend{" + ", localRocksDbDirectories=" + Arrays.toString(localRocksDbDirectories) + ", enableIncrementalCheckpointing=" + enableIncrementalCheckpointing + ", numberOfTransferThreads=" + numberOfTransferThreads + ", writeBatchSize=" + writeBatchSize + '}'; } // ------------------------------------------------------------------------ // static library loading utilities // ------------------------------------------------------------------------ @VisibleForTesting static void ensureRocksDBIsLoaded(String tempDirectory) throws IOException { synchronized (EmbeddedRocksDBStateBackend.class) { if (!rocksDbInitialized) { final File tempDirParent = new File(tempDirectory).getAbsoluteFile(); LOG.info( "Attempting to load RocksDB native library and store it under '{}'", tempDirParent); Throwable lastException = null; for (int attempt = 1; attempt <= ROCKSDB_LIB_LOADING_ATTEMPTS; attempt++) { File rocksLibFolder = null; try { // when multiple instances of this class and RocksDB exist in different // class loaders, then we can see the following exception: // "java.lang.UnsatisfiedLinkError: Native Library // /path/to/temp/dir/librocksdbjni-linux64.so // already loaded in another class loader" // to avoid that, we need to add a random element to the library file path // (I know, seems like an unnecessary hack, since the JVM obviously can // handle multiple // instances of the same JNI library being loaded in different class // loaders, but // apparently not when coming from the same file path, so there we go) rocksLibFolder = new File(tempDirParent, "rocksdb-lib-" + new AbstractID()); // make sure the temp path exists LOG.debug( "Attempting to create RocksDB native library folder {}", rocksLibFolder); // noinspection ResultOfMethodCallIgnored rocksLibFolder.mkdirs(); // explicitly load the JNI dependency if it has not been loaded before NativeLibraryLoader.getInstance() .loadLibrary(rocksLibFolder.getAbsolutePath()); // this initialization here should validate that the loading succeeded RocksDB.loadLibrary(); // seems to have worked LOG.info("Successfully loaded RocksDB native library"); rocksDbInitialized = true; return; } catch (Throwable t) { lastException = t; LOG.debug("RocksDB JNI library loading attempt {} failed", attempt, t); // try to force RocksDB to attempt reloading the library try { resetRocksDBLoadedFlag(); } catch (Throwable tt) { LOG.debug( "Failed to reset 'initialized' flag in RocksDB native code loader", tt); } FileUtils.deleteDirectoryQuietly(rocksLibFolder); } } throw new IOException("Could not load the native RocksDB library", lastException); } } } @VisibleForTesting static void resetRocksDBLoadedFlag() throws Exception { final Field initField = org.rocksdb.NativeLibraryLoader.class.getDeclaredField("initialized"); initField.setAccessible(true); initField.setBoolean(null, false); } // --------------------------------------------------------------------------------------------- // Enums // --------------------------------------------------------------------------------------------- /** The options to chose for the type of priority queue state. 
*/ public enum PriorityQueueStateType implements DescribedEnum { HEAP(text("Heap-based")), ROCKSDB(text("Implementation based on RocksDB")); private final InlineElement description; PriorityQueueStateType(InlineElement description) { this.description = description; } @Override public InlineElement getDescription() { return description; } } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/runtime/checkpoint/CheckpointCoordinator.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.runtime.checkpoint; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.JobID; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.runtime.checkpoint.CheckpointType.PostCheckpointAction; import org.apache.flink.runtime.checkpoint.hooks.MasterHooks; import org.apache.flink.runtime.executiongraph.Execution; import org.apache.flink.runtime.executiongraph.ExecutionAttemptID; import org.apache.flink.runtime.executiongraph.ExecutionJobVertex; import org.apache.flink.runtime.executiongraph.ExecutionVertex; import org.apache.flink.runtime.executiongraph.JobStatusListener; import org.apache.flink.runtime.jobgraph.JobVertexID; import org.apache.flink.runtime.jobgraph.OperatorID; import org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration; import org.apache.flink.runtime.messages.Acknowledge; import org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint; import org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint; import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; import org.apache.flink.runtime.operators.coordination.OperatorInfo; import org.apache.flink.runtime.persistence.PossibleInconsistentStateException; import org.apache.flink.runtime.state.CheckpointStorage; import org.apache.flink.runtime.state.CheckpointStorageCoordinatorView; import org.apache.flink.runtime.state.CheckpointStorageLocation; import org.apache.flink.runtime.state.CompletedCheckpointStorageLocation; import org.apache.flink.runtime.state.SharedStateRegistry; import org.apache.flink.runtime.state.SharedStateRegistryFactory; import org.apache.flink.runtime.state.memory.ByteStreamStateHandle; import org.apache.flink.util.ExceptionUtils; import org.apache.flink.util.FlinkRuntimeException; import org.apache.flink.util.Preconditions; import org.apache.flink.util.StringUtils; import org.apache.flink.util.clock.Clock; import org.apache.flink.util.clock.SystemClock; import org.apache.flink.util.concurrent.FutureUtils; import org.apache.flink.util.concurrent.ScheduledExecutor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; 
import javax.annotation.concurrent.GuardedBy;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.OptionalLong;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.Executor;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Predicate;
import java.util.stream.Stream;

import static java.util.stream.Collectors.toMap;
import static org.apache.flink.util.ExceptionUtils.findThrowable;
import static org.apache.flink.util.Preconditions.checkArgument;
import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * The checkpoint coordinator coordinates the distributed snapshots of operators and state. It
 * triggers the checkpoint by sending the messages to the relevant tasks and collects the
 * checkpoint acknowledgements. It also collects and maintains the overview of the state handles
 * reported by the tasks that acknowledge the checkpoint.
 */
public class CheckpointCoordinator {

    private static final Logger LOG = LoggerFactory.getLogger(CheckpointCoordinator.class);

    /** The number of recent checkpoints whose IDs are remembered. */
    private static final int NUM_GHOST_CHECKPOINT_IDS = 16;

    // ------------------------------------------------------------------------

    /** Coordinator-wide lock to safeguard the checkpoint updates. */
    private final Object lock = new Object();

    /** The job whose checkpoint this coordinator coordinates. */
    private final JobID job;

    /** Default checkpoint properties. */
    private final CheckpointProperties checkpointProperties;

    /** The executor used for asynchronous calls, like potentially blocking I/O. */
    private final Executor executor;

    private final CheckpointsCleaner checkpointsCleaner;

    /** The operator coordinators that need to be checkpointed. */
    private final Collection<OperatorCoordinatorCheckpointContext> coordinatorsToCheckpoint;

    /** Map from checkpoint ID to the pending checkpoint. */
    @GuardedBy("lock")
    private final Map<Long, PendingCheckpoint> pendingCheckpoints;

    /**
     * Completed checkpoints. Implementations can be blocking. Make sure calls to methods accessing
     * this don't block the job manager actor and run asynchronously.
     */
    private final CompletedCheckpointStore completedCheckpointStore;

    /**
     * The root checkpoint state backend, which is responsible for initializing the checkpoint,
     * storing the metadata, and cleaning up the checkpoint.
     */
    private final CheckpointStorageCoordinatorView checkpointStorageView;

    /** A list of recent checkpoint IDs, to identify late messages (vs invalid ones). */
    private final ArrayDeque<Long> recentPendingCheckpoints;

    /**
     * Checkpoint ID counter to ensure ascending IDs. In case of job manager failures, these need
     * to be ascending across job managers.
     */
    private final CheckpointIDCounter checkpointIdCounter;

    // TODO: ------------ start: fire custom modifications --------------- //

    /**
     * The base checkpoint interval. Actual trigger time may be affected by the max concurrent
     * checkpoints and minimum-pause values.
     */
    private long baseInterval;

    /**
     * The max time (in ms) that a checkpoint may take.
     */
    private long checkpointTimeout;

    /**
     * The min time (in ms) to delay after a checkpoint could be triggered. Allows enforcing a
     * minimum processing time between checkpoint attempts.
     */
    private long minPauseBetweenCheckpoints;

    // Accessors added by the fire customization so that the checkpoint interval, timeout and
    // minimum pause can be read and adjusted at runtime.
    public long getBaseInterval() {
        return baseInterval;
    }

    public void setBaseInterval(long baseInterval) {
        this.baseInterval = baseInterval;
    }

    public void setCheckpointTimeout(long checkpointTimeout) {
        this.checkpointTimeout = checkpointTimeout;
    }

    public long getMinPauseBetweenCheckpoints() {
        return minPauseBetweenCheckpoints;
    }

    public void setMinPauseBetweenCheckpoints(long minPauseBetweenCheckpoints) {
        this.minPauseBetweenCheckpoints = minPauseBetweenCheckpoints;
    }

    // Static handle to the coordinator instance, registered at the end of the constructor.
    private static CheckpointCoordinator coordinator;

    public static CheckpointCoordinator getInstance() {
        return CheckpointCoordinator.coordinator;
    }

    // TODO: ------------ end: fire custom modifications ----------------- //

    /**
     * The timer that handles the checkpoint timeouts and triggers periodic checkpoints. It must be
     * single-threaded. Eventually it will be replaced by main thread executor.
     */
    private final ScheduledExecutor timer;

    /** The master checkpoint hooks executed by this checkpoint coordinator. */
    private final HashMap<String, MasterTriggerRestoreHook<?>> masterHooks;

    private final boolean unalignedCheckpointsEnabled;

    private final long alignedCheckpointTimeout;

    /** Actor that receives status updates from the execution graph this coordinator works for. */
    private JobStatusListener jobStatusListener;

    /** The number of consecutive failed trigger attempts. */
    private final AtomicInteger numUnsuccessfulCheckpointsTriggers = new AtomicInteger(0);

    /** A handle to the current periodic trigger, to cancel it when necessary. */
    private ScheduledFuture<?> currentPeriodicTrigger;

    /**
     * The timestamp (via {@link Clock#relativeTimeMillis()}) when the last checkpoint completed.
     */
    private long lastCheckpointCompletionRelativeTime;

    /**
     * Flag whether a triggered checkpoint should immediately schedule the next checkpoint.
     * Non-volatile, because only accessed in synchronized scope.
     */
    private boolean periodicScheduling;

    /** Flag marking the coordinator as shut down (not accepting any messages any more). */
    private volatile boolean shutdown;

    /** Optional tracker for checkpoint statistics. */
    @Nullable private CheckpointStatsTracker statsTracker;

    /** A factory for SharedStateRegistry objects. */
    private final SharedStateRegistryFactory sharedStateRegistryFactory;

    /** Registry that tracks state which is shared across (incremental) checkpoints. */
    private SharedStateRegistry sharedStateRegistry;

    /** Id of checkpoint for which in-flight data should be ignored on recovery. */
    private final long checkpointIdOfIgnoredInFlightData;

    private final CheckpointFailureManager failureManager;

    private final Clock clock;

    private final boolean isExactlyOnceMode;

    /** Flag indicating that there is an in-flight trigger request.
*/ private boolean isTriggering = false; private final CheckpointRequestDecider requestDecider; private final CheckpointPlanCalculator checkpointPlanCalculator; private final ExecutionAttemptMappingProvider attemptMappingProvider; // -------------------------------------------------------------------------------------------- public CheckpointCoordinator( JobID job, CheckpointCoordinatorConfiguration chkConfig, Collection coordinatorsToCheckpoint, CheckpointIDCounter checkpointIDCounter, CompletedCheckpointStore completedCheckpointStore, CheckpointStorage checkpointStorage, Executor executor, CheckpointsCleaner checkpointsCleaner, ScheduledExecutor timer, SharedStateRegistryFactory sharedStateRegistryFactory, CheckpointFailureManager failureManager, CheckpointPlanCalculator checkpointPlanCalculator, ExecutionAttemptMappingProvider attemptMappingProvider) { this( job, chkConfig, coordinatorsToCheckpoint, checkpointIDCounter, completedCheckpointStore, checkpointStorage, executor, checkpointsCleaner, timer, sharedStateRegistryFactory, failureManager, checkpointPlanCalculator, attemptMappingProvider, SystemClock.getInstance()); } @VisibleForTesting public CheckpointCoordinator( JobID job, CheckpointCoordinatorConfiguration chkConfig, Collection coordinatorsToCheckpoint, CheckpointIDCounter checkpointIDCounter, CompletedCheckpointStore completedCheckpointStore, CheckpointStorage checkpointStorage, Executor executor, CheckpointsCleaner checkpointsCleaner, ScheduledExecutor timer, SharedStateRegistryFactory sharedStateRegistryFactory, CheckpointFailureManager failureManager, CheckpointPlanCalculator checkpointPlanCalculator, ExecutionAttemptMappingProvider attemptMappingProvider, Clock clock) { // sanity checks checkNotNull(checkpointStorage); // max "in between duration" can be one year - this is to prevent numeric overflows long minPauseBetweenCheckpoints = chkConfig.getMinPauseBetweenCheckpoints(); if (minPauseBetweenCheckpoints > 365L * 24 * 60 * 60 * 1_000) { minPauseBetweenCheckpoints = 365L * 24 * 60 * 60 * 1_000; } // it does not make sense to schedule checkpoints more often then the desired // time between checkpoints long baseInterval = chkConfig.getCheckpointInterval(); if (baseInterval < minPauseBetweenCheckpoints) { baseInterval = minPauseBetweenCheckpoints; } this.job = checkNotNull(job); this.baseInterval = baseInterval; this.checkpointTimeout = chkConfig.getCheckpointTimeout(); this.minPauseBetweenCheckpoints = minPauseBetweenCheckpoints; this.coordinatorsToCheckpoint = Collections.unmodifiableCollection(coordinatorsToCheckpoint); this.pendingCheckpoints = new LinkedHashMap<>(); this.checkpointIdCounter = checkNotNull(checkpointIDCounter); this.completedCheckpointStore = checkNotNull(completedCheckpointStore); this.executor = checkNotNull(executor); this.checkpointsCleaner = checkNotNull(checkpointsCleaner); this.sharedStateRegistryFactory = checkNotNull(sharedStateRegistryFactory); this.sharedStateRegistry = sharedStateRegistryFactory.create(executor); this.failureManager = checkNotNull(failureManager); this.checkpointPlanCalculator = checkNotNull(checkpointPlanCalculator); this.attemptMappingProvider = checkNotNull(attemptMappingProvider); this.clock = checkNotNull(clock); this.isExactlyOnceMode = chkConfig.isExactlyOnce(); this.unalignedCheckpointsEnabled = chkConfig.isUnalignedCheckpointsEnabled(); this.alignedCheckpointTimeout = chkConfig.getAlignedCheckpointTimeout(); this.checkpointIdOfIgnoredInFlightData = chkConfig.getCheckpointIdOfIgnoredInFlightData(); 
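        // NOTE (illustrative, fire customization): this coordinator instance is registered in the
        // static `coordinator` field at the end of this constructor, so platform code can later
        // obtain it and adjust the stored checkpoint parameters through the accessors added above.
        // A hypothetical caller (not part of this file) might look like:
        //
        //     CheckpointCoordinator cc = CheckpointCoordinator.getInstance();
        //     if (cc != null) {
        //         cc.setBaseInterval(60_000L);               // desired trigger interval, ms
        //         cc.setCheckpointTimeout(600_000L);         // desired timeout, ms
        //         cc.setMinPauseBetweenCheckpoints(30_000L); // desired minimum pause, ms
        //     }
        //
        // How and when the new values take effect depends on the scheduling code elsewhere in this
        // class; the snippet above is only a sketch of the intended use of the added setters.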
        this.recentPendingCheckpoints = new ArrayDeque<>(NUM_GHOST_CHECKPOINT_IDS);
        this.masterHooks = new HashMap<>();

        this.timer = timer;

        this.checkpointProperties =
                CheckpointProperties.forCheckpoint(chkConfig.getCheckpointRetentionPolicy());

        try {
            this.checkpointStorageView = checkpointStorage.createCheckpointStorage(job);
            checkpointStorageView.initializeBaseLocations();
        } catch (IOException e) {
            throw new FlinkRuntimeException(
                    "Failed to create checkpoint storage at checkpoint coordinator side.", e);
        }

        try {
            // Make sure the checkpoint ID enumerator is running. Possibly
            // issues a blocking call to ZooKeeper.
            checkpointIDCounter.start();
        } catch (Throwable t) {
            throw new RuntimeException(
                    "Failed to start checkpoint ID counter: " + t.getMessage(), t);
        }
        this.requestDecider =
                new CheckpointRequestDecider(
                        chkConfig.getMaxConcurrentCheckpoints(),
                        this::rescheduleTrigger,
                        this.clock,
                        this.minPauseBetweenCheckpoints,
                        this.pendingCheckpoints::size,
                        this.checkpointsCleaner::getNumberOfCheckpointsToClean);

        // TODO: ------------ start: fire custom modifications --------------- //
        // Register this instance so it can be obtained via the static getInstance() accessor.
        CheckpointCoordinator.coordinator = this;
        // TODO: ------------ end: fire custom modifications --------------- //
    }

    // --------------------------------------------------------------------------------------------
    //  Configuration
    // --------------------------------------------------------------------------------------------

    /**
     * Adds the given master hook to the checkpoint coordinator. This method does nothing if the
     * checkpoint coordinator already contained a hook with the same ID (as defined via {@link
     * MasterTriggerRestoreHook#getIdentifier()}).
     *
     * @param hook The hook to add.
     * @return True, if the hook was added, false if the checkpoint coordinator already contained a
     *     hook with the same ID.
     */
    public boolean addMasterHook(MasterTriggerRestoreHook<?> hook) {
        checkNotNull(hook);

        final String id = hook.getIdentifier();
        checkArgument(!StringUtils.isNullOrWhitespaceOnly(id), "The hook has a null or empty id");

        synchronized (lock) {
            if (!masterHooks.containsKey(id)) {
                masterHooks.put(id, hook);
                return true;
            } else {
                return false;
            }
        }
    }

    /** Gets the number of currently registered master hooks. */
    public int getNumberOfRegisteredMasterHooks() {
        synchronized (lock) {
            return masterHooks.size();
        }
    }

    /**
     * Sets the checkpoint stats tracker.
     *
     * @param statsTracker The checkpoint stats tracker.
     */
    public void setCheckpointStatsTracker(@Nullable CheckpointStatsTracker statsTracker) {
        this.statsTracker = statsTracker;
    }

    // --------------------------------------------------------------------------------------------
    //  Clean shutdown
    // --------------------------------------------------------------------------------------------

    /**
     * Shuts down the checkpoint coordinator.
     *
     *

After this method has been called, the coordinator does not accept and further messages * and cannot trigger any further checkpoints. */ public void shutdown() throws Exception { synchronized (lock) { if (!shutdown) { shutdown = true; LOG.info("Stopping checkpoint coordinator for job {}.", job); periodicScheduling = false; // shut down the hooks MasterHooks.close(masterHooks.values(), LOG); masterHooks.clear(); final CheckpointException reason = new CheckpointException( CheckpointFailureReason.CHECKPOINT_COORDINATOR_SHUTDOWN); // clear queued requests and in-flight checkpoints abortPendingAndQueuedCheckpoints(reason); } } } public boolean isShutdown() { return shutdown; } // -------------------------------------------------------------------------------------------- // Triggering Checkpoints and Savepoints // -------------------------------------------------------------------------------------------- /** * Triggers a savepoint with the given savepoint directory as a target. * * @param targetLocation Target location for the savepoint, optional. If null, the state * backend's configured default will be used. * @return A future to the completed checkpoint * @throws IllegalStateException If no savepoint directory has been specified and no default * savepoint directory has been configured */ public CompletableFuture triggerSavepoint( @Nullable final String targetLocation) { final CheckpointProperties properties = CheckpointProperties.forSavepoint(!unalignedCheckpointsEnabled); return triggerSavepointInternal(properties, targetLocation); } /** * Triggers a synchronous savepoint with the given savepoint directory as a target. * * @param terminate flag indicating if the job should terminate or just suspend * @param targetLocation Target location for the savepoint, optional. If null, the state * backend's configured default will be used. * @return A future to the completed checkpoint * @throws IllegalStateException If no savepoint directory has been specified and no default * savepoint directory has been configured */ public CompletableFuture triggerSynchronousSavepoint( final boolean terminate, @Nullable final String targetLocation) { final CheckpointProperties properties = CheckpointProperties.forSyncSavepoint(!unalignedCheckpointsEnabled, terminate); return triggerSavepointInternal(properties, targetLocation); } private CompletableFuture triggerSavepointInternal( final CheckpointProperties checkpointProperties, @Nullable final String targetLocation) { checkNotNull(checkpointProperties); // TODO, call triggerCheckpoint directly after removing timer thread // for now, execute the trigger in timer thread to avoid competition final CompletableFuture resultFuture = new CompletableFuture<>(); timer.execute( () -> triggerCheckpoint(checkpointProperties, targetLocation, false) .whenComplete( (completedCheckpoint, throwable) -> { if (throwable == null) { resultFuture.complete(completedCheckpoint); } else { resultFuture.completeExceptionally(throwable); } })); return resultFuture; } /** * Triggers a new standard checkpoint and uses the given timestamp as the checkpoint timestamp. * The return value is a future. It completes when the checkpoint triggered finishes or an error * occurred. * * @param isPeriodic Flag indicating whether this triggered checkpoint is periodic. If this flag * is true, but the periodic scheduler is disabled, the checkpoint will be declined. * @return a future to the completed checkpoint. 
*/ public CompletableFuture triggerCheckpoint(boolean isPeriodic) { return triggerCheckpoint(checkpointProperties, null, isPeriodic); } @VisibleForTesting public CompletableFuture triggerCheckpoint( CheckpointProperties props, @Nullable String externalSavepointLocation, boolean isPeriodic) { if (props.getCheckpointType().getPostCheckpointAction() == PostCheckpointAction.TERMINATE && !(props.isSynchronous() && props.isSavepoint())) { return FutureUtils.completedExceptionally( new IllegalArgumentException( "Only synchronous savepoints are allowed to advance the watermark to MAX.")); } CheckpointTriggerRequest request = new CheckpointTriggerRequest(props, externalSavepointLocation, isPeriodic); chooseRequestToExecute(request).ifPresent(this::startTriggeringCheckpoint); return request.onCompletionPromise; } private void startTriggeringCheckpoint(CheckpointTriggerRequest request) { try { synchronized (lock) { preCheckGlobalState(request.isPeriodic); } // we will actually trigger this checkpoint! Preconditions.checkState(!isTriggering); isTriggering = true; final long timestamp = System.currentTimeMillis(); CompletableFuture checkpointPlanFuture = checkpointPlanCalculator.calculateCheckpointPlan(); final CompletableFuture pendingCheckpointCompletableFuture = checkpointPlanFuture .thenApplyAsync( plan -> { try { CheckpointIdAndStorageLocation checkpointIdAndStorageLocation = initializeCheckpoint( request.props, request.externalSavepointLocation); return new Tuple2<>( plan, checkpointIdAndStorageLocation); } catch (Throwable e) { throw new CompletionException(e); } }, executor) .thenApplyAsync( (checkpointInfo) -> createPendingCheckpoint( timestamp, request.props, checkpointInfo.f0, request.isPeriodic, checkpointInfo.f1.checkpointId, checkpointInfo.f1.checkpointStorageLocation, request.getOnCompletionFuture()), timer); final CompletableFuture coordinatorCheckpointsComplete = pendingCheckpointCompletableFuture.thenComposeAsync( (pendingCheckpoint) -> OperatorCoordinatorCheckpoints .triggerAndAcknowledgeAllCoordinatorCheckpointsWithCompletion( coordinatorsToCheckpoint, pendingCheckpoint, timer), timer); // We have to take the snapshot of the master hooks after the coordinator checkpoints // has completed. // This is to ensure the tasks are checkpointed after the OperatorCoordinators in case // ExternallyInducedSource is used. final CompletableFuture masterStatesComplete = coordinatorCheckpointsComplete.thenComposeAsync( ignored -> { // If the code reaches here, the pending checkpoint is guaranteed to // be not null. // We use FutureUtils.getWithoutException() to make compiler happy // with checked // exceptions in the signature. 
PendingCheckpoint checkpoint = FutureUtils.getWithoutException( pendingCheckpointCompletableFuture); return snapshotMasterState(checkpoint); }, timer); FutureUtils.assertNoException( CompletableFuture.allOf(masterStatesComplete, coordinatorCheckpointsComplete) .handleAsync( (ignored, throwable) -> { final PendingCheckpoint checkpoint = FutureUtils.getWithoutException( pendingCheckpointCompletableFuture); Preconditions.checkState( checkpoint != null || throwable != null, "Either the pending checkpoint needs to be created or an error must have occurred."); if (throwable != null) { // the initialization might not be finished yet if (checkpoint == null) { onTriggerFailure(request, throwable); } else { onTriggerFailure(checkpoint, throwable); } } else { triggerCheckpointRequest( request, timestamp, checkpoint); } return null; }, timer) .exceptionally( error -> { if (!isShutdown()) { throw new CompletionException(error); } else if (findThrowable( error, RejectedExecutionException.class) .isPresent()) { LOG.debug("Execution rejected during shutdown"); } else { LOG.warn("Error encountered during shutdown", error); } return null; })); } catch (Throwable throwable) { onTriggerFailure(request, throwable); } } private void triggerCheckpointRequest( CheckpointTriggerRequest request, long timestamp, PendingCheckpoint checkpoint) { if (checkpoint.isDisposed()) { onTriggerFailure( checkpoint, new CheckpointException( CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE, checkpoint.getFailureCause())); } else { triggerTasks(request, timestamp, checkpoint) .exceptionally( failure -> { LOG.info( "Triggering Checkpoint {} for job {} failed due to {}", checkpoint.getCheckpointID(), job, failure); final CheckpointException cause; if (failure instanceof CheckpointException) { cause = (CheckpointException) failure; } else { cause = new CheckpointException( CheckpointFailureReason .TRIGGER_CHECKPOINT_FAILURE, failure); } timer.execute( () -> { synchronized (lock) { abortPendingCheckpoint(checkpoint, cause); } }); return null; }); coordinatorsToCheckpoint.forEach( (ctx) -> ctx.afterSourceBarrierInjection(checkpoint.getCheckpointID())); // It is possible that the tasks has finished // checkpointing at this point. // So we need to complete this pending checkpoint. if (maybeCompleteCheckpoint(checkpoint)) { onTriggerSuccess(); } } } private CompletableFuture triggerTasks( CheckpointTriggerRequest request, long timestamp, PendingCheckpoint checkpoint) { // no exception, no discarding, everything is OK final long checkpointId = checkpoint.getCheckpointID(); final CheckpointOptions checkpointOptions = CheckpointOptions.forConfig( request.props.getCheckpointType(), checkpoint.getCheckpointStorageLocation().getLocationReference(), isExactlyOnceMode, unalignedCheckpointsEnabled, alignedCheckpointTimeout); // send messages to the tasks to trigger their checkpoints List> acks = new ArrayList<>(); for (Execution execution : checkpoint.getCheckpointPlan().getTasksToTrigger()) { if (request.props.isSynchronous()) { acks.add( execution.triggerSynchronousSavepoint( checkpointId, timestamp, checkpointOptions)); } else { acks.add(execution.triggerCheckpoint(checkpointId, timestamp, checkpointOptions)); } } return FutureUtils.waitForAll(acks); } /** * Initialize the checkpoint trigger asynchronously. It will expected to be executed in io * thread due to it might be time-consuming. 
* * @param props checkpoint properties * @param externalSavepointLocation the external savepoint location, it might be null * @return the initialized result, checkpoint id and checkpoint location */ private CheckpointIdAndStorageLocation initializeCheckpoint( CheckpointProperties props, @Nullable String externalSavepointLocation) throws Exception { // this must happen outside the coordinator-wide lock, because it // communicates // with external services (in HA mode) and may block for a while. long checkpointID = checkpointIdCounter.getAndIncrement(); CheckpointStorageLocation checkpointStorageLocation = props.isSavepoint() ? checkpointStorageView.initializeLocationForSavepoint( checkpointID, externalSavepointLocation) : checkpointStorageView.initializeLocationForCheckpoint(checkpointID); return new CheckpointIdAndStorageLocation(checkpointID, checkpointStorageLocation); } private PendingCheckpoint createPendingCheckpoint( long timestamp, CheckpointProperties props, CheckpointPlan checkpointPlan, boolean isPeriodic, long checkpointID, CheckpointStorageLocation checkpointStorageLocation, CompletableFuture onCompletionPromise) { synchronized (lock) { try { // since we haven't created the PendingCheckpoint yet, we need to check the // global state here. preCheckGlobalState(isPeriodic); } catch (Throwable t) { throw new CompletionException(t); } } final PendingCheckpoint checkpoint = new PendingCheckpoint( job, checkpointID, timestamp, checkpointPlan, OperatorInfo.getIds(coordinatorsToCheckpoint), masterHooks.keySet(), props, checkpointStorageLocation, onCompletionPromise); trackPendingCheckpointStats(checkpoint); synchronized (lock) { pendingCheckpoints.put(checkpointID, checkpoint); ScheduledFuture cancellerHandle = timer.schedule( new CheckpointCanceller(checkpoint), checkpointTimeout, TimeUnit.MILLISECONDS); if (!checkpoint.setCancellerHandle(cancellerHandle)) { // checkpoint is already disposed! cancellerHandle.cancel(false); } } LOG.info( "Triggering checkpoint {} (type={}) @ {} for job {}.", checkpointID, checkpoint.getProps().getCheckpointType(), timestamp, job); return checkpoint; } /** * Snapshot master hook states asynchronously. * * @param checkpoint the pending checkpoint * @return the future represents master hook states are finished or not */ private CompletableFuture snapshotMasterState(PendingCheckpoint checkpoint) { if (masterHooks.isEmpty()) { return CompletableFuture.completedFuture(null); } final long checkpointID = checkpoint.getCheckpointId(); final long timestamp = checkpoint.getCheckpointTimestamp(); final CompletableFuture masterStateCompletableFuture = new CompletableFuture<>(); for (MasterTriggerRestoreHook masterHook : masterHooks.values()) { MasterHooks.triggerHook(masterHook, checkpointID, timestamp, executor) .whenCompleteAsync( (masterState, throwable) -> { try { synchronized (lock) { if (masterStateCompletableFuture.isDone()) { return; } if (checkpoint.isDisposed()) { throw new IllegalStateException( "Checkpoint " + checkpointID + " has been discarded"); } if (throwable == null) { checkpoint.acknowledgeMasterState( masterHook.getIdentifier(), masterState); if (checkpoint.areMasterStatesFullyAcknowledged()) { masterStateCompletableFuture.complete(null); } } else { masterStateCompletableFuture.completeExceptionally( throwable); } } } catch (Throwable t) { masterStateCompletableFuture.completeExceptionally(t); } }, timer); } return masterStateCompletableFuture; } /** Trigger request is successful. NOTE, it must be invoked if trigger request is successful. 
*/ private void onTriggerSuccess() { isTriggering = false; numUnsuccessfulCheckpointsTriggers.set(0); executeQueuedRequest(); } /** * The trigger request is failed prematurely without a proper initialization. There is no * resource to release, but the completion promise needs to fail manually here. * * @param onCompletionPromise the completion promise of the checkpoint/savepoint * @param throwable the reason of trigger failure */ private void onTriggerFailure( CheckpointTriggerRequest onCompletionPromise, Throwable throwable) { final CheckpointException checkpointException = getCheckpointException( CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE, throwable); onCompletionPromise.completeExceptionally(checkpointException); onTriggerFailure((PendingCheckpoint) null, onCompletionPromise.props, checkpointException); } private void onTriggerFailure(PendingCheckpoint checkpoint, Throwable throwable) { checkArgument(checkpoint != null, "Pending checkpoint can not be null."); onTriggerFailure(checkpoint, checkpoint.getProps(), throwable); } /** * The trigger request is failed. NOTE, it must be invoked if trigger request is failed. * * @param checkpoint the pending checkpoint which is failed. It could be null if it's failed * prematurely without a proper initialization. * @param throwable the reason of trigger failure */ private void onTriggerFailure( @Nullable PendingCheckpoint checkpoint, CheckpointProperties checkpointProperties, Throwable throwable) { // beautify the stack trace a bit throwable = ExceptionUtils.stripCompletionException(throwable); try { coordinatorsToCheckpoint.forEach( OperatorCoordinatorCheckpointContext::abortCurrentTriggering); final CheckpointException cause = getCheckpointException( CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE, throwable); if (checkpoint != null && !checkpoint.isDisposed()) { int numUnsuccessful = numUnsuccessfulCheckpointsTriggers.incrementAndGet(); LOG.warn( "Failed to trigger checkpoint {} for job {}. ({} consecutive failed attempts so far)", checkpoint.getCheckpointId(), job, numUnsuccessful, throwable); synchronized (lock) { abortPendingCheckpoint(checkpoint, cause); } } else { LOG.info( "Failed to trigger checkpoint for job {} because {}", job, throwable.getMessage()); failureManager.handleCheckpointException( checkpoint, checkpointProperties, cause, null); } } finally { isTriggering = false; executeQueuedRequest(); } } private void executeQueuedRequest() { chooseQueuedRequestToExecute().ifPresent(this::startTriggeringCheckpoint); } private Optional chooseQueuedRequestToExecute() { synchronized (lock) { return requestDecider.chooseQueuedRequestToExecute( isTriggering, lastCheckpointCompletionRelativeTime); } } private Optional chooseRequestToExecute( CheckpointTriggerRequest request) { synchronized (lock) { return requestDecider.chooseRequestToExecute( request, isTriggering, lastCheckpointCompletionRelativeTime); } } // Returns true if the checkpoint is successfully completed, false otherwise. private boolean maybeCompleteCheckpoint(PendingCheckpoint checkpoint) { synchronized (lock) { if (checkpoint.isFullyAcknowledged()) { try { // we need to check inside the lock for being shutdown as well, // otherwise we get races and invalid error log messages. 
if (shutdown) { return false; } completePendingCheckpoint(checkpoint); } catch (CheckpointException ce) { onTriggerFailure(checkpoint, ce); return false; } } } return true; } // -------------------------------------------------------------------------------------------- // Handling checkpoints and messages // -------------------------------------------------------------------------------------------- /** * Receives a {@link DeclineCheckpoint} message for a pending checkpoint. * * @param message Checkpoint decline from the task manager * @param taskManagerLocationInfo The location info of the decline checkpoint message's sender */ public void receiveDeclineMessage(DeclineCheckpoint message, String taskManagerLocationInfo) { if (shutdown || message == null) { return; } if (!job.equals(message.getJob())) { throw new IllegalArgumentException( "Received DeclineCheckpoint message for job " + message.getJob() + " from " + taskManagerLocationInfo + " while this coordinator handles job " + job); } final long checkpointId = message.getCheckpointId(); final CheckpointException checkpointException = message.getSerializedCheckpointException().unwrap(); final String reason = checkpointException.getMessage(); PendingCheckpoint checkpoint; synchronized (lock) { // we need to check inside the lock for being shutdown as well, otherwise we // get races and invalid error log messages if (shutdown) { return; } checkpoint = pendingCheckpoints.get(checkpointId); if (checkpoint != null) { Preconditions.checkState( !checkpoint.isDisposed(), "Received message for discarded but non-removed checkpoint " + checkpointId); LOG.info( "Decline checkpoint {} by task {} of job {} at {}.", checkpointId, message.getTaskExecutionId(), job, taskManagerLocationInfo, checkpointException.getCause()); abortPendingCheckpoint( checkpoint, checkpointException, message.getTaskExecutionId()); } else if (LOG.isDebugEnabled()) { if (recentPendingCheckpoints.contains(checkpointId)) { // message is for an unknown checkpoint, or comes too late (checkpoint disposed) LOG.debug( "Received another decline message for now expired checkpoint attempt {} from task {} of job {} at {} : {}", checkpointId, message.getTaskExecutionId(), job, taskManagerLocationInfo, reason); } else { // message is for an unknown checkpoint. might be so old that we don't even // remember it any more LOG.debug( "Received decline message for unknown (too old?) checkpoint attempt {} from task {} of job {} at {} : {}", checkpointId, message.getTaskExecutionId(), job, taskManagerLocationInfo, reason); } } } } /** * Receives an AcknowledgeCheckpoint message and returns whether the message was associated with * a pending checkpoint. * * @param message Checkpoint ack from the task manager * @param taskManagerLocationInfo The location of the acknowledge checkpoint message's sender * @return Flag indicating whether the ack'd checkpoint was associated with a pending * checkpoint. * @throws CheckpointException If the checkpoint cannot be added to the completed checkpoint * store. 
*/ public boolean receiveAcknowledgeMessage( AcknowledgeCheckpoint message, String taskManagerLocationInfo) throws CheckpointException { if (shutdown || message == null) { return false; } if (!job.equals(message.getJob())) { LOG.error( "Received wrong AcknowledgeCheckpoint message for job {} from {} : {}", job, taskManagerLocationInfo, message); return false; } final long checkpointId = message.getCheckpointId(); synchronized (lock) { // we need to check inside the lock for being shutdown as well, otherwise we // get races and invalid error log messages if (shutdown) { return false; } final PendingCheckpoint checkpoint = pendingCheckpoints.get(checkpointId); if (checkpoint != null && !checkpoint.isDisposed()) { switch (checkpoint.acknowledgeTask( message.getTaskExecutionId(), message.getSubtaskState(), message.getCheckpointMetrics(), getStatsCallback(checkpoint))) { case SUCCESS: LOG.debug( "Received acknowledge message for checkpoint {} from task {} of job {} at {}.", checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); if (checkpoint.isFullyAcknowledged()) { completePendingCheckpoint(checkpoint); } break; case DUPLICATE: LOG.debug( "Received a duplicate acknowledge message for checkpoint {}, task {}, job {}, location {}.", message.getCheckpointId(), message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); break; case UNKNOWN: LOG.warn( "Could not acknowledge the checkpoint {} for task {} of job {} at {}, " + "because the task's execution attempt id was unknown. Discarding " + "the state handle to avoid lingering state.", message.getCheckpointId(), message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); discardSubtaskState( message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); break; case DISCARDED: LOG.warn( "Could not acknowledge the checkpoint {} for task {} of job {} at {}, " + "because the pending checkpoint had been discarded. Discarding the " + "state handle tp avoid lingering state.", message.getCheckpointId(), message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); discardSubtaskState( message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); } return true; } else if (checkpoint != null) { // this should not happen throw new IllegalStateException( "Received message for discarded but non-removed checkpoint " + checkpointId); } else { reportStats( message.getCheckpointId(), message.getTaskExecutionId(), message.getCheckpointMetrics()); boolean wasPendingCheckpoint; // message is for an unknown checkpoint, or comes too late (checkpoint disposed) if (recentPendingCheckpoints.contains(checkpointId)) { wasPendingCheckpoint = true; LOG.warn( "Received late message for now expired checkpoint attempt {} from task " + "{} of job {} at {}.", checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); } else { LOG.debug( "Received message for an unknown checkpoint {} from task {} of job {} at {}.", checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo); wasPendingCheckpoint = false; } // try to discard the state so that we don't have lingering state lying around discardSubtaskState( message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState()); return wasPendingCheckpoint; } } } /** * Try to complete the given pending checkpoint. * *

Important: This method should only be called in the checkpoint lock scope. * * @param pendingCheckpoint to complete * @throws CheckpointException if the completion failed */ private void completePendingCheckpoint(PendingCheckpoint pendingCheckpoint) throws CheckpointException { final long checkpointId = pendingCheckpoint.getCheckpointId(); final CompletedCheckpoint completedCheckpoint; // As a first step to complete the checkpoint, we register its state with the registry Map operatorStates = pendingCheckpoint.getOperatorStates(); sharedStateRegistry.registerAll(operatorStates.values()); try { try { completedCheckpoint = pendingCheckpoint.finalizeCheckpoint( checkpointsCleaner, this::scheduleTriggerRequest, executor, getStatsCallback(pendingCheckpoint)); failureManager.handleCheckpointSuccess(pendingCheckpoint.getCheckpointId()); } catch (Exception e1) { // abort the current pending checkpoint if we fails to finalize the pending // checkpoint. if (!pendingCheckpoint.isDisposed()) { abortPendingCheckpoint( pendingCheckpoint, new CheckpointException( CheckpointFailureReason.FINALIZE_CHECKPOINT_FAILURE, e1)); } throw new CheckpointException( "Could not finalize the pending checkpoint " + checkpointId + '.', CheckpointFailureReason.FINALIZE_CHECKPOINT_FAILURE, e1); } // the pending checkpoint must be discarded after the finalization Preconditions.checkState(pendingCheckpoint.isDisposed() && completedCheckpoint != null); try { completedCheckpointStore.addCheckpoint( completedCheckpoint, checkpointsCleaner, this::scheduleTriggerRequest); } catch (Exception exception) { if (exception instanceof PossibleInconsistentStateException) { LOG.warn( "An error occurred while writing checkpoint {} to the underlying metadata store. Flink was not able to determine whether the metadata was successfully persisted. The corresponding state located at '{}' won't be discarded and needs to be cleaned up manually.", completedCheckpoint.getCheckpointID(), completedCheckpoint.getExternalPointer()); } else { // we failed to store the completed checkpoint. 
Let's clean up checkpointsCleaner.cleanCheckpointOnFailedStoring( completedCheckpoint, executor); } sendAbortedMessages( pendingCheckpoint.getCheckpointPlan().getTasksToCommitTo(), checkpointId, pendingCheckpoint.getCheckpointTimestamp()); throw new CheckpointException( "Could not complete the pending checkpoint " + checkpointId + '.', CheckpointFailureReason.FINALIZE_CHECKPOINT_FAILURE, exception); } } finally { pendingCheckpoints.remove(checkpointId); scheduleTriggerRequest(); } rememberRecentCheckpointId(checkpointId); // drop those pending checkpoints that are at prior to the completed one dropSubsumedCheckpoints(checkpointId); // record the time when this was completed, to calculate // the 'min delay between checkpoints' lastCheckpointCompletionRelativeTime = clock.relativeTimeMillis(); LOG.info( "Completed checkpoint {} for job {} ({} bytes, checkpointDuration={} ms, finalizationTime={} ms).", checkpointId, job, completedCheckpoint.getStateSize(), completedCheckpoint.getCompletionTimestamp() - completedCheckpoint.getTimestamp(), System.currentTimeMillis() - completedCheckpoint.getCompletionTimestamp()); if (LOG.isDebugEnabled()) { StringBuilder builder = new StringBuilder(); builder.append("Checkpoint state: "); for (OperatorState state : completedCheckpoint.getOperatorStates().values()) { builder.append(state); builder.append(", "); } // Remove last two chars ", " builder.setLength(builder.length() - 2); LOG.debug(builder.toString()); } // send the "notify complete" call to all vertices, coordinators, etc. sendAcknowledgeMessages( pendingCheckpoint.getCheckpointPlan().getTasksToCommitTo(), checkpointId, completedCheckpoint.getTimestamp()); } void scheduleTriggerRequest() { synchronized (lock) { if (isShutdown()) { LOG.debug( "Skip scheduling trigger request because the CheckpointCoordinator is shut down"); } else { timer.execute(this::executeQueuedRequest); } } } private void sendAcknowledgeMessages( List tasksToCommit, long checkpointId, long timestamp) { // commit tasks for (ExecutionVertex ev : tasksToCommit) { Execution ee = ev.getCurrentExecutionAttempt(); if (ee != null) { ee.notifyCheckpointComplete(checkpointId, timestamp); } } // commit coordinators for (OperatorCoordinatorCheckpointContext coordinatorContext : coordinatorsToCheckpoint) { coordinatorContext.notifyCheckpointComplete(checkpointId); } } private void sendAbortedMessages( List tasksToAbort, long checkpointId, long timeStamp) { assert (Thread.holdsLock(lock)); long latestCompletedCheckpointId = completedCheckpointStore.getLatestCheckpointId(); // send notification of aborted checkpoints asynchronously. executor.execute( () -> { // send the "abort checkpoint" messages to necessary vertices. for (ExecutionVertex ev : tasksToAbort) { Execution ee = ev.getCurrentExecutionAttempt(); if (ee != null) { ee.notifyCheckpointAborted( checkpointId, latestCompletedCheckpointId, timeStamp); } } }); // commit coordinators for (OperatorCoordinatorCheckpointContext coordinatorContext : coordinatorsToCheckpoint) { coordinatorContext.notifyCheckpointAborted(checkpointId); } } /** * Fails all pending checkpoints which have not been acknowledged by the given execution attempt * id. 
* * @param executionAttemptId for which to discard unacknowledged pending checkpoints * @param cause of the failure */ public void failUnacknowledgedPendingCheckpointsFor( ExecutionAttemptID executionAttemptId, Throwable cause) { synchronized (lock) { abortPendingCheckpoints( checkpoint -> !checkpoint.isAcknowledgedBy(executionAttemptId), new CheckpointException(CheckpointFailureReason.TASK_FAILURE, cause)); } } private void rememberRecentCheckpointId(long id) { if (recentPendingCheckpoints.size() >= NUM_GHOST_CHECKPOINT_IDS) { recentPendingCheckpoints.removeFirst(); } recentPendingCheckpoints.addLast(id); } private void dropSubsumedCheckpoints(long checkpointId) { abortPendingCheckpoints( checkpoint -> checkpoint.getCheckpointId() < checkpointId && checkpoint.canBeSubsumed(), new CheckpointException(CheckpointFailureReason.CHECKPOINT_SUBSUMED)); } // -------------------------------------------------------------------------------------------- // Checkpoint State Restoring // -------------------------------------------------------------------------------------------- /** * Restores the latest checkpointed state to a set of subtasks. This method represents a "local" * or "regional" failover and does restore states to coordinators. Note that a regional failover * might still include all tasks. * * @param tasks Set of job vertices to restore. State for these vertices is restored via {@link * Execution#setInitialState(JobManagerTaskRestore)}. * @return An {@code OptionalLong} with the checkpoint ID, if state was restored, an empty * {@code OptionalLong} otherwise. * @throws IllegalStateException If the CheckpointCoordinator is shut down. * @throws IllegalStateException If no completed checkpoint is available and the * failIfNoCheckpoint flag has been set. * @throws IllegalStateException If the checkpoint contains state that cannot be mapped to any * job vertex in tasks and the allowNonRestoredState flag has not * been set. * @throws IllegalStateException If the max parallelism changed for an operator that restores * state from this checkpoint. * @throws IllegalStateException If the parallelism changed for an operator that restores * non-partitioned state from this checkpoint. */ public OptionalLong restoreLatestCheckpointedStateToSubtasks( final Set tasks) throws Exception { // when restoring subtasks only we accept potentially unmatched state for the // following reasons // - the set frequently does not include all Job Vertices (only the ones that are part // of the restarted region), meaning there will be unmatched state by design. // - because what we might end up restoring from an original savepoint with unmatched // state, if there is was no checkpoint yet. return restoreLatestCheckpointedStateInternal( tasks, OperatorCoordinatorRestoreBehavior .SKIP, // local/regional recovery does not reset coordinators false, // recovery might come before first successful checkpoint true, false); // see explanation above } /** * Restores the latest checkpointed state to all tasks and all coordinators. This method * represents a "global restore"-style operation where all stateful tasks and coordinators from * the given set of Job Vertices are restored. are restored to their latest checkpointed state. * * @param tasks Set of job vertices to restore. State for these vertices is restored via {@link * Execution#setInitialState(JobManagerTaskRestore)}. * @param allowNonRestoredState Allow checkpoint state that cannot be mapped to any job vertex * in tasks. 
* @return true if state was restored, false otherwise. * @throws IllegalStateException If the CheckpointCoordinator is shut down. * @throws IllegalStateException If no completed checkpoint is available and the * failIfNoCheckpoint flag has been set. * @throws IllegalStateException If the checkpoint contains state that cannot be mapped to any * job vertex in tasks and the allowNonRestoredState flag has not * been set. * @throws IllegalStateException If the max parallelism changed for an operator that restores * state from this checkpoint. * @throws IllegalStateException If the parallelism changed for an operator that restores * non-partitioned state from this checkpoint. */ public boolean restoreLatestCheckpointedStateToAll( final Set tasks, final boolean allowNonRestoredState) throws Exception { final OptionalLong restoredCheckpointId = restoreLatestCheckpointedStateInternal( tasks, OperatorCoordinatorRestoreBehavior .RESTORE_OR_RESET, // global recovery restores coordinators, or // resets them to empty false, // recovery might come before first successful checkpoint allowNonRestoredState, false); return restoredCheckpointId.isPresent(); } /** * Restores the latest checkpointed at the beginning of the job execution. If there is a * checkpoint, this method acts like a "global restore"-style operation where all stateful tasks * and coordinators from the given set of Job Vertices are restored. * * @param tasks Set of job vertices to restore. State for these vertices is restored via {@link * Execution#setInitialState(JobManagerTaskRestore)}. * @return True, if a checkpoint was found and its state was restored, false otherwise. */ public boolean restoreInitialCheckpointIfPresent(final Set tasks) throws Exception { final OptionalLong restoredCheckpointId = restoreLatestCheckpointedStateInternal( tasks, OperatorCoordinatorRestoreBehavior.RESTORE_IF_CHECKPOINT_PRESENT, false, // initial checkpoints exist only on JobManager failover. ok if not // present. false, true); // JobManager failover means JobGraphs match exactly. return restoredCheckpointId.isPresent(); } /** * Performs the actual restore operation to the given tasks. * *

This method returns the restored checkpoint ID (as an optional) or an empty optional, if * no checkpoint was restored. */ private OptionalLong restoreLatestCheckpointedStateInternal( final Set tasks, final OperatorCoordinatorRestoreBehavior operatorCoordinatorRestoreBehavior, final boolean errorIfNoCheckpoint, final boolean allowNonRestoredState, final boolean checkForPartiallyFinishedOperators) throws Exception { synchronized (lock) { if (shutdown) { throw new IllegalStateException("CheckpointCoordinator is shut down"); } // We create a new shared state registry object, so that all pending async disposal // requests from previous runs will go against the old object (were they can do no // harm). This must happen under the checkpoint lock. sharedStateRegistry.close(); sharedStateRegistry = sharedStateRegistryFactory.create(executor); // Now, we re-register all (shared) states from the checkpoint store with the new // registry for (CompletedCheckpoint completedCheckpoint : completedCheckpointStore.getAllCheckpoints()) { completedCheckpoint.registerSharedStatesAfterRestored(sharedStateRegistry); } LOG.debug( "Status of the shared state registry of job {} after restore: {}.", job, sharedStateRegistry); // Restore from the latest checkpoint CompletedCheckpoint latest = completedCheckpointStore.getLatestCheckpoint(); if (latest == null) { LOG.info("No checkpoint found during restore."); if (errorIfNoCheckpoint) { throw new IllegalStateException("No completed checkpoint available"); } LOG.debug("Resetting the master hooks."); MasterHooks.reset(masterHooks.values(), LOG); if (operatorCoordinatorRestoreBehavior == OperatorCoordinatorRestoreBehavior.RESTORE_OR_RESET) { // we let the JobManager-side components know that there was a recovery, // even if there was no checkpoint to recover from, yet LOG.info("Resetting the Operator Coordinators to an empty state."); restoreStateToCoordinators( OperatorCoordinator.NO_CHECKPOINT, Collections.emptyMap()); } return OptionalLong.empty(); } LOG.info("Restoring job {} from {}.", job, latest); // re-assign the task states final Map operatorStates = extractOperatorStates(latest); if (checkForPartiallyFinishedOperators) { VertexFinishedStateChecker vertexFinishedStateChecker = new VertexFinishedStateChecker(tasks, operatorStates); vertexFinishedStateChecker.validateOperatorsFinishedState(); } StateAssignmentOperation stateAssignmentOperation = new StateAssignmentOperation( latest.getCheckpointID(), tasks, operatorStates, allowNonRestoredState); stateAssignmentOperation.assignStates(); // call master hooks for restore. 
we currently call them also on "regional restore" // because // there is no other failure notification mechanism in the master hooks // ultimately these should get removed anyways in favor of the operator coordinators MasterHooks.restoreMasterHooks( masterHooks, latest.getMasterHookStates(), latest.getCheckpointID(), allowNonRestoredState, LOG); if (operatorCoordinatorRestoreBehavior != OperatorCoordinatorRestoreBehavior.SKIP) { restoreStateToCoordinators(latest.getCheckpointID(), operatorStates); } // update metrics if (statsTracker != null) { long restoreTimestamp = System.currentTimeMillis(); RestoredCheckpointStats restored = new RestoredCheckpointStats( latest.getCheckpointID(), latest.getProperties(), restoreTimestamp, latest.getExternalPointer()); statsTracker.reportRestoredCheckpoint(restored); } return OptionalLong.of(latest.getCheckpointID()); } } private Map extractOperatorStates(CompletedCheckpoint checkpoint) { Map originalOperatorStates = checkpoint.getOperatorStates(); if (checkpoint.getCheckpointID() != checkpointIdOfIgnoredInFlightData) { // Don't do any changes if it is not required. return originalOperatorStates; } HashMap newStates = new HashMap<>(); // Create the new operator states without in-flight data. for (OperatorState originalOperatorState : originalOperatorStates.values()) { newStates.put( originalOperatorState.getOperatorID(), originalOperatorState.copyAndDiscardInFlightData()); } return newStates; } /** * Restore the state with given savepoint. * * @param savepointPointer The pointer to the savepoint. * @param allowNonRestored True if allowing checkpoint state that cannot be mapped to any job * vertex in tasks. * @param tasks Map of job vertices to restore. State for these vertices is restored via {@link * Execution#setInitialState(JobManagerTaskRestore)}. * @param userClassLoader The class loader to resolve serialized classes in legacy savepoint * versions. */ public boolean restoreSavepoint( String savepointPointer, boolean allowNonRestored, Map tasks, ClassLoader userClassLoader) throws Exception { Preconditions.checkNotNull(savepointPointer, "The savepoint path cannot be null."); LOG.info( "Starting job {} from savepoint {} ({})", job, savepointPointer, (allowNonRestored ? 
"allowing non restored state" : "")); final CompletedCheckpointStorageLocation checkpointLocation = checkpointStorageView.resolveCheckpoint(savepointPointer); // Load the savepoint as a checkpoint into the system CompletedCheckpoint savepoint = Checkpoints.loadAndValidateCheckpoint( job, tasks, checkpointLocation, userClassLoader, allowNonRestored); completedCheckpointStore.addCheckpoint( savepoint, checkpointsCleaner, this::scheduleTriggerRequest); // Reset the checkpoint ID counter long nextCheckpointId = savepoint.getCheckpointID() + 1; checkpointIdCounter.setCount(nextCheckpointId); LOG.info("Reset the checkpoint ID of job {} to {}.", job, nextCheckpointId); final OptionalLong restoredCheckpointId = restoreLatestCheckpointedStateInternal( new HashSet<>(tasks.values()), OperatorCoordinatorRestoreBehavior.RESTORE_IF_CHECKPOINT_PRESENT, true, allowNonRestored, true); return restoredCheckpointId.isPresent(); } // ------------------------------------------------------------------------ // Accessors // ------------------------------------------------------------------------ public int getNumberOfPendingCheckpoints() { synchronized (lock) { return this.pendingCheckpoints.size(); } } public int getNumberOfRetainedSuccessfulCheckpoints() { synchronized (lock) { return completedCheckpointStore.getNumberOfRetainedCheckpoints(); } } public Map getPendingCheckpoints() { synchronized (lock) { return new HashMap<>(this.pendingCheckpoints); } } public List getSuccessfulCheckpoints() throws Exception { synchronized (lock) { return completedCheckpointStore.getAllCheckpoints(); } } public CheckpointStorageCoordinatorView getCheckpointStorage() { return checkpointStorageView; } public CompletedCheckpointStore getCheckpointStore() { return completedCheckpointStore; } public long getCheckpointTimeout() { return checkpointTimeout; } /** @deprecated use {@link #getNumQueuedRequests()} */ @Deprecated @VisibleForTesting PriorityQueue getTriggerRequestQueue() { synchronized (lock) { return requestDecider.getTriggerRequestQueue(); } } public boolean isTriggering() { return isTriggering; } @VisibleForTesting boolean isCurrentPeriodicTriggerAvailable() { return currentPeriodicTrigger != null; } /** * Returns whether periodic checkpointing has been configured. * * @return true if periodic checkpoints have been configured. 
*/ public boolean isPeriodicCheckpointingConfigured() { return baseInterval != Long.MAX_VALUE; } // -------------------------------------------------------------------------------------------- // Periodic scheduling of checkpoints // -------------------------------------------------------------------------------------------- public void startCheckpointScheduler() { synchronized (lock) { if (shutdown) { throw new IllegalArgumentException("Checkpoint coordinator is shut down"); } Preconditions.checkState( isPeriodicCheckpointingConfigured(), "Can not start checkpoint scheduler, if no periodic checkpointing is configured"); // make sure all prior timers are cancelled stopCheckpointScheduler(); periodicScheduling = true; currentPeriodicTrigger = scheduleTriggerWithDelay(getRandomInitDelay()); } } public void stopCheckpointScheduler() { synchronized (lock) { periodicScheduling = false; cancelPeriodicTrigger(); final CheckpointException reason = new CheckpointException(CheckpointFailureReason.CHECKPOINT_COORDINATOR_SUSPEND); abortPendingAndQueuedCheckpoints(reason); numUnsuccessfulCheckpointsTriggers.set(0); } } public boolean isPeriodicCheckpointingStarted() { return periodicScheduling; } /** * Aborts all the pending checkpoints due to en exception. * * @param exception The exception. */ public void abortPendingCheckpoints(CheckpointException exception) { synchronized (lock) { abortPendingCheckpoints(ignored -> true, exception); } } private void abortPendingCheckpoints( Predicate checkpointToFailPredicate, CheckpointException exception) { assert Thread.holdsLock(lock); final PendingCheckpoint[] pendingCheckpointsToFail = pendingCheckpoints.values().stream() .filter(checkpointToFailPredicate) .toArray(PendingCheckpoint[]::new); // do not traverse pendingCheckpoints directly, because it might be changed during // traversing for (PendingCheckpoint pendingCheckpoint : pendingCheckpointsToFail) { abortPendingCheckpoint(pendingCheckpoint, exception); } } private void rescheduleTrigger(long tillNextMillis) { cancelPeriodicTrigger(); currentPeriodicTrigger = scheduleTriggerWithDelay(tillNextMillis); } private void cancelPeriodicTrigger() { if (currentPeriodicTrigger != null) { currentPeriodicTrigger.cancel(false); currentPeriodicTrigger = null; } } private long getRandomInitDelay() { return ThreadLocalRandom.current().nextLong(minPauseBetweenCheckpoints, baseInterval + 1L); } private ScheduledFuture scheduleTriggerWithDelay(long initDelay) { return timer.scheduleAtFixedRate( new ScheduledTrigger(), initDelay, baseInterval, TimeUnit.MILLISECONDS); } private void restoreStateToCoordinators( final long checkpointId, final Map operatorStates) throws Exception { for (OperatorCoordinatorCheckpointContext coordContext : coordinatorsToCheckpoint) { final OperatorState state = operatorStates.get(coordContext.operatorId()); final ByteStreamStateHandle coordinatorState = state == null ? null : state.getCoordinatorState(); final byte[] bytes = coordinatorState == null ? 
null : coordinatorState.getData(); coordContext.resetToCheckpoint(checkpointId, bytes); } } // ------------------------------------------------------------------------ // job status listener that schedules / cancels periodic checkpoints // ------------------------------------------------------------------------ public JobStatusListener createActivatorDeactivator() { synchronized (lock) { if (shutdown) { throw new IllegalArgumentException("Checkpoint coordinator is shut down"); } if (jobStatusListener == null) { jobStatusListener = new CheckpointCoordinatorDeActivator(this); } return jobStatusListener; } } int getNumQueuedRequests() { synchronized (lock) { return requestDecider.getNumQueuedRequests(); } } public void reportStats(long id, ExecutionAttemptID attemptId, CheckpointMetrics metrics) throws CheckpointException { if (statsTracker != null) { attemptMappingProvider .getVertex(attemptId) .ifPresent(ev -> statsTracker.reportIncompleteStats(id, ev, metrics)); } } // ------------------------------------------------------------------------ private final class ScheduledTrigger implements Runnable { @Override public void run() { try { triggerCheckpoint(true); } catch (Exception e) { LOG.error("Exception while triggering checkpoint for job {}.", job, e); } } } /** * Discards the given state object asynchronously belonging to the given job, execution attempt * id and checkpoint id. * * @param jobId identifying the job to which the state object belongs * @param executionAttemptID identifying the task to which the state object belongs * @param checkpointId of the state object * @param subtaskState to discard asynchronously */ private void discardSubtaskState( final JobID jobId, final ExecutionAttemptID executionAttemptID, final long checkpointId, final TaskStateSnapshot subtaskState) { if (subtaskState != null) { executor.execute( new Runnable() { @Override public void run() { try { subtaskState.discardState(); } catch (Throwable t2) { LOG.warn( "Could not properly discard state object of checkpoint {} " + "belonging to task {} of job {}.", checkpointId, executionAttemptID, jobId, t2); } } }); } } private void abortPendingCheckpoint( PendingCheckpoint pendingCheckpoint, CheckpointException exception) { abortPendingCheckpoint(pendingCheckpoint, exception, null); } private void abortPendingCheckpoint( PendingCheckpoint pendingCheckpoint, CheckpointException exception, @Nullable final ExecutionAttemptID executionAttemptID) { assert (Thread.holdsLock(lock)); if (!pendingCheckpoint.isDisposed()) { try { // release resource here pendingCheckpoint.abort( exception.getCheckpointFailureReason(), exception.getCause(), checkpointsCleaner, this::scheduleTriggerRequest, executor, getStatsCallback(pendingCheckpoint)); failureManager.handleCheckpointException( pendingCheckpoint, pendingCheckpoint.getProps(), exception, executionAttemptID); } finally { sendAbortedMessages( pendingCheckpoint.getCheckpointPlan().getTasksToCommitTo(), pendingCheckpoint.getCheckpointId(), pendingCheckpoint.getCheckpointTimestamp()); pendingCheckpoints.remove(pendingCheckpoint.getCheckpointId()); rememberRecentCheckpointId(pendingCheckpoint.getCheckpointId()); scheduleTriggerRequest(); } } } private void preCheckGlobalState(boolean isPeriodic) throws CheckpointException { // abort if the coordinator has been shutdown in the meantime if (shutdown) { throw new CheckpointException(CheckpointFailureReason.CHECKPOINT_COORDINATOR_SHUTDOWN); } // Don't allow periodic checkpoint if scheduling has been disabled if (isPeriodic && 
!periodicScheduling) { throw new CheckpointException(CheckpointFailureReason.PERIODIC_SCHEDULER_SHUTDOWN); } } private void abortPendingAndQueuedCheckpoints(CheckpointException exception) { assert (Thread.holdsLock(lock)); requestDecider.abortAll(exception); abortPendingCheckpoints(exception); } /** * The canceller of checkpoint. The checkpoint might be cancelled if it doesn't finish in a * configured period. */ private class CheckpointCanceller implements Runnable { private final PendingCheckpoint pendingCheckpoint; private CheckpointCanceller(PendingCheckpoint pendingCheckpoint) { this.pendingCheckpoint = checkNotNull(pendingCheckpoint); } @Override public void run() { synchronized (lock) { // only do the work if the checkpoint is not discarded anyways // note that checkpoint completion discards the pending checkpoint object if (!pendingCheckpoint.isDisposed()) { LOG.info( "Checkpoint {} of job {} expired before completing.", pendingCheckpoint.getCheckpointId(), job); abortPendingCheckpoint( pendingCheckpoint, new CheckpointException(CheckpointFailureReason.CHECKPOINT_EXPIRED)); } } } } private static CheckpointException getCheckpointException( CheckpointFailureReason defaultReason, Throwable throwable) { final Optional ioExceptionOptional = findThrowable(throwable, IOException.class); if (ioExceptionOptional.isPresent()) { return new CheckpointException(CheckpointFailureReason.IO_EXCEPTION, throwable); } else { final Optional checkpointExceptionOptional = findThrowable(throwable, CheckpointException.class); return checkpointExceptionOptional.orElseGet( () -> new CheckpointException(defaultReason, throwable)); } } private static class CheckpointIdAndStorageLocation { private final long checkpointId; private final CheckpointStorageLocation checkpointStorageLocation; CheckpointIdAndStorageLocation( long checkpointId, CheckpointStorageLocation checkpointStorageLocation) { this.checkpointId = checkpointId; this.checkpointStorageLocation = checkNotNull(checkpointStorageLocation); } } static class CheckpointTriggerRequest { final long timestamp; final CheckpointProperties props; final @Nullable String externalSavepointLocation; final boolean isPeriodic; private final CompletableFuture onCompletionPromise = new CompletableFuture<>(); CheckpointTriggerRequest( CheckpointProperties props, @Nullable String externalSavepointLocation, boolean isPeriodic) { this.timestamp = System.currentTimeMillis(); this.props = checkNotNull(props); this.externalSavepointLocation = externalSavepointLocation; this.isPeriodic = isPeriodic; } CompletableFuture getOnCompletionFuture() { return onCompletionPromise; } public void completeExceptionally(CheckpointException exception) { onCompletionPromise.completeExceptionally(exception); } public boolean isForce() { return props.forceCheckpoint(); } } private enum OperatorCoordinatorRestoreBehavior { /** Coordinators are always restored. If there is no checkpoint, they are restored empty. */ RESTORE_OR_RESET, /** Coordinators are restored if there was a checkpoint. */ RESTORE_IF_CHECKPOINT_PRESENT, /** Coordinators are not restored during this checkpoint restore. 
*/ SKIP; } private void trackPendingCheckpointStats(PendingCheckpoint checkpoint) { if (statsTracker == null) { return; } Map vertices = Stream.concat( checkpoint.getCheckpointPlan().getTasksToWaitFor().stream(), checkpoint.getCheckpointPlan().getFinishedTasks().stream()) .map(Execution::getVertex) .map(ExecutionVertex::getJobVertex) .distinct() .collect( toMap( ExecutionJobVertex::getJobVertexId, ExecutionJobVertex::getParallelism)); PendingCheckpointStats pendingCheckpointStats = statsTracker.reportPendingCheckpoint( checkpoint.getCheckpointID(), checkpoint.getCheckpointTimestamp(), checkpoint.getProps(), vertices); reportFinishedTasks( pendingCheckpointStats, checkpoint.getCheckpointPlan().getFinishedTasks()); } private void reportFinishedTasks( PendingCheckpointStats pendingCheckpointStats, List finishedTasks) { long now = System.currentTimeMillis(); finishedTasks.forEach( execution -> pendingCheckpointStats.reportSubtaskStats( execution.getVertex().getJobvertexId(), new SubtaskStateStats(execution.getParallelSubtaskIndex(), now))); } @Nullable private PendingCheckpointStats getStatsCallback(PendingCheckpoint pendingCheckpoint) { return statsTracker == null ? null : statsTracker.getPendingCheckpointStats(pendingCheckpoint.getCheckpointID()); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/runtime/util/EnvironmentInformation.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.runtime.util; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.configuration.GlobalConfiguration; import org.apache.flink.util.OperatingSystem; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.InputStream; import java.lang.management.ManagementFactory; import java.lang.management.RuntimeMXBean; import java.lang.reflect.Method; import java.time.Instant; import java.time.ZoneId; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeParseException; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.concurrent.ConcurrentHashMap; /** * Utility class that gives access to the execution environment of the JVM, like the executing user, * startup options, or the JVM version. */ public class EnvironmentInformation { @VisibleForTesting public static final String UNKNOWN_COMMIT_ID = "DecafC0ffeeD0d0F00d"; @VisibleForTesting public static final String UNKNOWN_COMMIT_ID_ABBREV = "DeadD0d0"; private static final Logger LOG = LoggerFactory.getLogger(EnvironmentInformation.class); public static final String UNKNOWN = ""; /** * Returns the version of the code as String. * * @return The project version string. 
*/ public static String getVersion() { return getVersionsInstance().projectVersion; } /** * Returns the version of the used Scala compiler as String. * * @return The scala version string. */ public static String getScalaVersion() { return getVersionsInstance().scalaVersion; } /** @return The Instant this version of the software was built. */ public static Instant getBuildTime() { return getVersionsInstance().gitBuildTime; } /** * @return The Instant this version of the software was built as a String using the * Europe/Berlin timezone. */ public static String getBuildTimeString() { return getVersionsInstance().gitBuildTimeStr; } /** @return The last known commit id of this version of the software. */ public static String getGitCommitId() { return getVersionsInstance().gitCommitId; } /** @return The last known abbreviated commit id of this version of the software. */ public static String getGitCommitIdAbbrev() { return getVersionsInstance().gitCommitIdAbbrev; } /** @return The Instant of the last commit of this code. */ public static Instant getGitCommitTime() { return getVersionsInstance().gitCommitTime; } /** * @return The Instant of the last commit of this code as a String using the Europe/Berlin * timezone. */ public static String getGitCommitTimeString() { return getVersionsInstance().gitCommitTimeStr; } /** * Returns the code revision (commit and commit date) of Flink, as generated by the Maven * builds. * * @return The code revision. */ public static RevisionInformation getRevisionInformation() { return new RevisionInformation(getGitCommitIdAbbrev(), getGitCommitTimeString()); } private static final class Versions { private static final Instant DEFAULT_TIME_INSTANT = Instant.EPOCH; private static final String DEFAULT_TIME_STRING = "1970-01-01T00:00:00+0000"; private String projectVersion = UNKNOWN; private String scalaVersion = UNKNOWN; private Instant gitBuildTime = DEFAULT_TIME_INSTANT; private String gitBuildTimeStr = DEFAULT_TIME_STRING; private String gitCommitId = UNKNOWN_COMMIT_ID; private String gitCommitIdAbbrev = UNKNOWN_COMMIT_ID_ABBREV; private Instant gitCommitTime = DEFAULT_TIME_INSTANT; private String gitCommitTimeStr = DEFAULT_TIME_STRING; private static final String PROP_FILE = ".flink-runtime.version.properties"; private static final String FAIL_MESSAGE = "The file " + PROP_FILE + " has not been generated correctly. You MUST run 'mvn generate-sources' in the flink-runtime module."; private String getProperty(Properties properties, String key, String defaultValue) { String value = properties.getProperty(key); if (value == null || value.charAt(0) == '$') { return defaultValue; } return value; } public Versions() { ClassLoader classLoader = EnvironmentInformation.class.getClassLoader(); try (InputStream propFile = classLoader.getResourceAsStream(PROP_FILE)) { if (propFile != null) { Properties properties = new Properties(); properties.load(propFile); projectVersion = getProperty(properties, "project.version", UNKNOWN); scalaVersion = getProperty(properties, "scala.binary.version", UNKNOWN); gitCommitId = getProperty(properties, "git.commit.id", UNKNOWN_COMMIT_ID); gitCommitIdAbbrev = getProperty( properties, "git.commit.id.abbrev", UNKNOWN_COMMIT_ID_ABBREV); // This is to reliably parse the datetime format configured in the // git-commit-id-plugin DateTimeFormatter gitDateTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ssZ"); // Default format is in Berlin timezone because that is where Flink originated. 
DateTimeFormatter berlinDateTime = DateTimeFormatter.ISO_OFFSET_DATE_TIME.withZone( ZoneId.of("Europe/Berlin")); try { String propGitCommitTime = getProperty(properties, "git.commit.time", DEFAULT_TIME_STRING); gitCommitTime = gitDateTimeFormatter.parse(propGitCommitTime, Instant::from); gitCommitTimeStr = berlinDateTime.format(gitCommitTime); String propGitBuildTime = getProperty(properties, "git.build.time", DEFAULT_TIME_STRING); gitBuildTime = gitDateTimeFormatter.parse(propGitBuildTime, Instant::from); gitBuildTimeStr = berlinDateTime.format(gitBuildTime); } catch (DateTimeParseException dtpe) { LOG.error("{} : {}", FAIL_MESSAGE, dtpe); throw new IllegalStateException(FAIL_MESSAGE); } } } catch (IOException ioe) { LOG.info( "Cannot determine code revision: Unable to read version property file.: {}", ioe.getMessage()); } } } private static final class VersionsHolder { static final Versions INSTANCE = new Versions(); } private static Versions getVersionsInstance() { return VersionsHolder.INSTANCE; } /** * Gets the name of the user that is running the JVM. * * @return The name of the user that is running the JVM. */ public static String getHadoopUser() { try { Class ugiClass = Class.forName( "org.apache.hadoop.security.UserGroupInformation", false, EnvironmentInformation.class.getClassLoader()); Method currentUserMethod = ugiClass.getMethod("getCurrentUser"); Method shortUserNameMethod = ugiClass.getMethod("getShortUserName"); Object ugi = currentUserMethod.invoke(null); return (String) shortUserNameMethod.invoke(ugi); } catch (ClassNotFoundException e) { return ""; } catch (LinkageError e) { // hadoop classes are not in the classpath LOG.debug( "Cannot determine user/group information using Hadoop utils. " + "Hadoop classes not loaded or compatible", e); } catch (Throwable t) { // some other error occurred that we should log and make known LOG.warn("Error while accessing user/group information via Hadoop utils.", t); } return UNKNOWN; } /** * The maximum JVM heap size, in bytes. * *

This method uses the -Xmx value of the JVM, if set. If not set, it returns (as a * heuristic) 1/4th of the physical memory size. * * @return The maximum JVM heap size, in bytes. */ public static long getMaxJvmHeapMemory() { final long maxMemory = Runtime.getRuntime().maxMemory(); if (maxMemory != Long.MAX_VALUE) { // we have the proper max memory return maxMemory; } else { // max JVM heap size is not set - use the heuristic to use 1/4th of the physical memory final long physicalMemory = Hardware.getSizeOfPhysicalMemory(); if (physicalMemory != -1) { // got proper value for physical memory return physicalMemory / 4; } else { throw new RuntimeException( "Could not determine the amount of free memory.\n" + "Please set the maximum memory for the JVM, e.g. -Xmx512M for 512 megabytes."); } } } /** * Gets an estimate of the size of the free heap memory. * *

NOTE: This method is heavy-weight. It triggers a garbage collection to reduce * fragmentation and get a better estimate at the size of free memory. It is typically more * accurate than the plain version {@link #getSizeOfFreeHeapMemory()}. * * @return An estimate of the size of the free heap memory, in bytes. */ public static long getSizeOfFreeHeapMemoryWithDefrag() { // trigger a garbage collection, to reduce fragmentation System.gc(); return getSizeOfFreeHeapMemory(); } /** * Gets an estimate of the size of the free heap memory. The estimate may vary, depending on the * current level of memory fragmentation and the number of dead objects. For a better (but more * heavy-weight) estimate, use {@link #getSizeOfFreeHeapMemoryWithDefrag()}. * * @return An estimate of the size of the free heap memory, in bytes. */ public static long getSizeOfFreeHeapMemory() { Runtime r = Runtime.getRuntime(); return getMaxJvmHeapMemory() - r.totalMemory() + r.freeMemory(); } /** * Gets the version of the JVM in the form "VM_Name - Vendor - Spec/Version". * * @return The JVM version. */ public static String getJvmVersion() { try { final RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); return bean.getVmName() + " - " + bean.getVmVendor() + " - " + bean.getSpecVersion() + '/' + bean.getVmVersion(); } catch (Throwable t) { return UNKNOWN; } } /** * Gets the system parameters and environment parameters that were passed to the JVM on startup. * * @return The options passed to the JVM on startup. */ public static String getJvmStartupOptions() { try { final RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); final StringBuilder bld = new StringBuilder(); for (String s : bean.getInputArguments()) { bld.append(s).append(' '); } return bld.toString(); } catch (Throwable t) { return UNKNOWN; } } /** * Gets the system parameters and environment parameters that were passed to the JVM on startup. * * @return The options passed to the JVM on startup. */ public static String[] getJvmStartupOptionsArray() { try { RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); List options = bean.getInputArguments(); return options.toArray(new String[options.size()]); } catch (Throwable t) { return new String[0]; } } /** * Gets the directory for temporary files, as returned by the JVM system property * "java.io.tmpdir". * * @return The directory for temporary files. */ public static String getTemporaryFileDirectory() { return System.getProperty("java.io.tmpdir"); } /** * Tries to retrieve the maximum number of open file handles. This method will only work on * UNIX-based operating systems with Sun/Oracle Java versions. * *

 * <p>If the number of max open file handles cannot be determined, this method returns {@code -1}.
 *
 * @return The limit of open file handles, or {@code -1}, if the limit could not be determined.
 */
public static long getOpenFileHandlesLimit() {
    if (OperatingSystem.isWindows()) { // getMaxFileDescriptorCount method is not available on Windows
        return -1L;
    }
    Class<?> sunBeanClass;
    try {
        sunBeanClass = Class.forName("com.sun.management.UnixOperatingSystemMXBean");
    } catch (ClassNotFoundException e) {
        return -1L;
    }
    try {
        Method fhLimitMethod = sunBeanClass.getMethod("getMaxFileDescriptorCount");
        Object result = fhLimitMethod.invoke(ManagementFactory.getOperatingSystemMXBean());
        return (Long) result;
    } catch (Throwable t) {
        LOG.warn("Unexpected error when accessing file handle limit", t);
        return -1L;
    }
}

// TODO: ------------ start: custom (secondary development) code ---------------- //
// Used to determine whether the current process is the JobManager
private static Boolean IS_JOBMANAGER = true;
private static final Map<String, String> settings = new ConcurrentHashMap<>();

/** Returns whether the current component is the JobManager. */
public static boolean isJobManager() {
    return IS_JOBMANAGER;
}

/** Returns the configuration settings parsed from the startup command. */
public static Map<String, String> getSettings() {
    return settings;
}

/** Stores a configuration entry; an existing key is not overwritten. */
public static void setSetting(String key, String value) {
    if (!settings.containsKey(key)) settings.put(key, value);
}

/** Parses the startup command and determines whether this process is the JobManager. */
private static void parseCommand(String[] commandLineArgs) {
    if (commandLineArgs != null) {
        for (String command : commandLineArgs) {
            if (command != null && command.length() > 0) {
                if (command.contains("resource-id")) {
                    IS_JOBMANAGER = false;
                }
                if (!"-D".equals(command)) {
                    String[] properties = command.replace("-D", "").split("=", 2);
                    if (properties != null && properties.length == 2 && properties[0] != null && properties[1] != null) {
                        settings.put(properties[0], properties[1]);
                    }
                }
            }
        }
    }
}
// TODO: ------------ end: custom (secondary development) code ----------------- //

/**
 * Logs information about the environment, like code revision, current user, Java version, and
 * JVM parameters.
 *
 * @param log The logger to log the information to.
 * @param componentName The component name to mention in the log.
 * @param commandLineArgs The arguments passed when starting the component.
 */
public static void logEnvironmentInfo(Logger log, String componentName, String[] commandLineArgs) {
    // TODO: ------------ start: custom (secondary development) code --------------- //
    parseCommand(commandLineArgs);
    // TODO: ------------ end: custom (secondary development) code ---------------- //
    if (log.isInfoEnabled()) {
        RevisionInformation rev = getRevisionInformation();
        String version = getVersion();
        String scalaVersion = getScalaVersion();
        String jvmVersion = getJvmVersion();
        String[] options = getJvmStartupOptionsArray();
        String javaHome = System.getenv("JAVA_HOME");
        String inheritedLogs = System.getenv("FLINK_INHERITED_LOGS");
        long maxHeapMegabytes = getMaxJvmHeapMemory() >>> 20;
        if (inheritedLogs != null) {
            log.info("--------------------------------------------------------------------------------");
            log.info(" Preconfiguration: ");
            log.info(inheritedLogs);
        }
        log.info("--------------------------------------------------------------------------------");
        log.info(" Starting " + componentName + " (Version: " + version + ", Scala: " + scalaVersion + ", " + "Rev:" + rev.commitId + ", " + "Date:" + rev.commitDate + ")");
        log.info(" OS current user: " + System.getProperty("user.name"));
        log.info(" Current Hadoop/Kerberos user: " + getHadoopUser());
        log.info(" JVM: " + jvmVersion);
        log.info(" Maximum heap size: " + maxHeapMegabytes + " MiBytes");
        log.info(" JAVA_HOME: " + (javaHome == null ?
"(not set)" : javaHome)); String hadoopVersionString = getHadoopVersionString(); if (hadoopVersionString != null) { log.info(" Hadoop version: " + hadoopVersionString); } else { log.info(" No Hadoop Dependency available"); } if (options.length == 0) { log.info(" JVM Options: (none)"); } else { log.info(" JVM Options:"); for (String s : options) { log.info(" " + s); } } if (commandLineArgs == null || commandLineArgs.length == 0) { log.info(" Program Arguments: (none)"); } else { log.info(" Program Arguments:"); for (String s : commandLineArgs) { if (GlobalConfiguration.isSensitive(s)) { log.info( " " + GlobalConfiguration.HIDDEN_CONTENT + " (sensitive information)"); } else { log.info(" " + s); } } } log.info(" Classpath: " + System.getProperty("java.class.path")); log.info( "--------------------------------------------------------------------------------"); } } public static String getHadoopVersionString() { try { Class versionInfoClass = Class.forName( "org.apache.hadoop.util.VersionInfo", false, EnvironmentInformation.class.getClassLoader()); Method method = versionInfoClass.getMethod("getVersion"); return (String) method.invoke(null); } catch (ClassNotFoundException | NoSuchMethodException e) { return null; } catch (Throwable e) { LOG.error("Cannot invoke VersionInfo.getVersion reflectively.", e); return null; } } // -------------------------------------------------------------------------------------------- /** Don't instantiate this class */ private EnvironmentInformation() {} // -------------------------------------------------------------------------------------------- /** * Revision information encapsulates information about the source code revision of the Flink * code. */ public static class RevisionInformation { /** The git commit id (hash) */ public final String commitId; /** The git commit date */ public final String commitDate; public RevisionInformation(String commitId, String commitDate) { this.commitId = commitId; this.commitDate = commitDate; } } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/streaming/connectors/kafka/FlinkKafkaConsumer.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.streaming.connectors.kafka; import org.apache.flink.annotation.PublicEvolving; import org.apache.flink.api.common.eventtime.WatermarkStrategy; import org.apache.flink.api.common.serialization.DeserializationSchema; import org.apache.flink.configuration.Configuration; import org.apache.flink.metrics.MetricGroup; import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; import org.apache.flink.streaming.connectors.kafka.config.OffsetCommitMode; import org.apache.flink.streaming.connectors.kafka.internals.AbstractFetcher; import org.apache.flink.streaming.connectors.kafka.internals.AbstractPartitionDiscoverer; import org.apache.flink.streaming.connectors.kafka.internals.KafkaDeserializationSchemaWrapper; import org.apache.flink.streaming.connectors.kafka.internals.KafkaFetcher; import org.apache.flink.streaming.connectors.kafka.internals.KafkaPartitionDiscoverer; import org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicPartition; import org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicsDescriptor; import org.apache.flink.util.PropertiesUtil; import org.apache.flink.util.SerializedValue; import org.apache.kafka.clients.consumer.ConsumerConfig; import org.apache.kafka.clients.consumer.KafkaConsumer; import org.apache.kafka.clients.consumer.OffsetAndTimestamp; import org.apache.kafka.common.TopicPartition; import org.apache.kafka.common.serialization.ByteArrayDeserializer; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.regex.Pattern; import static org.apache.flink.util.Preconditions.checkNotNull; import static org.apache.flink.util.PropertiesUtil.getBoolean; import static org.apache.flink.util.PropertiesUtil.getLong; /** * The Flink Kafka Consumer is a streaming data source that pulls a parallel data stream from Apache * Kafka. The consumer can run in multiple parallel instances, each of which will pull data from one * or more Kafka partitions. * *

The Flink Kafka Consumer participates in checkpointing and guarantees that no data is lost * during a failure, and that the computation processes elements "exactly once". (Note: These * guarantees naturally assume that Kafka itself does not lose any data.) * *
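 * <p>A minimal sketch of the job-side setup this guarantee relies on (the interval value is
 * illustrative, not a default of this class):
 *
 * <pre>{@code
 * StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
 * env.enableCheckpointing(60_000, CheckpointingMode.EXACTLY_ONCE);
 * }</pre>
 *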

Please note that Flink snapshots the offsets internally as part of its distributed * checkpoints. The offsets committed to Kafka are only to bring the outside view of progress in * sync with Flink's view of the progress. That way, monitoring and other jobs can get a view of how * far the Flink Kafka consumer has consumed a topic. * *
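 * <p>For example (a sketch; the topic name and properties are illustrative), the offsets of the
 * consumer group configured via {@code group.id} can be committed back to Kafka on each
 * completed checkpoint so that external monitoring can follow the consumer's progress:
 *
 * <pre>{@code
 * FlinkKafkaConsumer<String> consumer =
 *         new FlinkKafkaConsumer<>("demo-topic", new SimpleStringSchema(), props);
 * consumer.setCommitOffsetsOnCheckpoints(true);
 * }</pre>
 *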

Please refer to Kafka's documentation for the available configuration properties: * http://kafka.apache.org/documentation.html#newconsumerconfigs */ @PublicEvolving @Deprecated public class FlinkKafkaConsumer extends FlinkKafkaConsumerBase { private static final long serialVersionUID = 1L; /** Configuration key to change the polling timeout. * */ public static final String KEY_POLL_TIMEOUT = "flink.poll-timeout"; /** * From Kafka's Javadoc: The time, in milliseconds, spent waiting in poll if data is not * available. If 0, returns immediately with any records that are available now. */ public static final long DEFAULT_POLL_TIMEOUT = 100L; // ------------------------------------------------------------------------ /** User-supplied properties for Kafka. * */ protected final Properties properties; /** * From Kafka's Javadoc: The time, in milliseconds, spent waiting in poll if data is not * available. If 0, returns immediately with any records that are available now */ protected final long pollTimeout; // ------------------------------------------------------------------------ /** * Creates a new Kafka streaming source consumer. * * @param topic The name of the topic that should be consumed. * @param valueDeserializer The de-/serializer used to convert between Kafka's byte messages and * Flink's objects. * @param props */ public FlinkKafkaConsumer( String topic, DeserializationSchema valueDeserializer, Properties props) { this(Collections.singletonList(topic), valueDeserializer, props); } /** * Creates a new Kafka streaming source consumer. * *
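 * <p>A minimal sketch of a key/value-aware schema that can be passed to this constructor
 * (imports and error handling omitted; the concatenated output format is purely illustrative):
 *
 * <pre>{@code
 * KafkaDeserializationSchema<String> schema = new KafkaDeserializationSchema<String>() {
 *     public boolean isEndOfStream(String nextElement) {
 *         return false;
 *     }
 *
 *     public String deserialize(ConsumerRecord<byte[], byte[]> record) {
 *         String key = record.key() == null ? "" : new String(record.key(), StandardCharsets.UTF_8);
 *         String value = new String(record.value(), StandardCharsets.UTF_8);
 *         return key + "=" + value;
 *     }
 *
 *     public TypeInformation<String> getProducedType() {
 *         return Types.STRING;
 *     }
 * };
 * }</pre>
 *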

This constructor allows passing a {@see KafkaDeserializationSchema} for reading key/value * pairs, offsets, and topic names from Kafka. * * @param topic The name of the topic that should be consumed. * @param deserializer The keyed de-/serializer used to convert between Kafka's byte messages * and Flink's objects. * @param props */ public FlinkKafkaConsumer( String topic, KafkaDeserializationSchema deserializer, Properties props) { this(Collections.singletonList(topic), deserializer, props); } /** * Creates a new Kafka streaming source consumer. * *

This constructor allows passing multiple topics to the consumer. * * @param topics The Kafka topics to read from. * @param deserializer The de-/serializer used to convert between Kafka's byte messages and * Flink's objects. * @param props */ public FlinkKafkaConsumer( List topics, DeserializationSchema deserializer, Properties props) { this(topics, new KafkaDeserializationSchemaWrapper<>(deserializer), props); } /** * Creates a new Kafka streaming source consumer. * *

This constructor allows passing multiple topics and a key/value deserialization schema. * * @param topics The Kafka topics to read from. * @param deserializer The keyed de-/serializer used to convert between Kafka's byte messages * and Flink's objects. * @param props */ public FlinkKafkaConsumer( List topics, KafkaDeserializationSchema deserializer, Properties props) { this(topics, null, deserializer, props); } /** * Creates a new Kafka streaming source consumer. Use this constructor to subscribe to multiple * topics based on a regular expression pattern. * *
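 * <p>For example (a sketch; the servers, group id, pattern and interval are illustrative),
 * combined with the partition discovery setting described below:
 *
 * <pre>{@code
 * Properties props = new Properties();
 * props.setProperty("bootstrap.servers", "localhost:9092");
 * props.setProperty("group.id", "demo-group");
 * props.setProperty("flink.partition-discovery.interval-millis", "30000");
 *
 * FlinkKafkaConsumer<String> consumer =
 *         new FlinkKafkaConsumer<>(Pattern.compile("events-.*"), new SimpleStringSchema(), props);
 * }</pre>
 *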

If partition discovery is enabled (by setting a non-negative value for {@link * FlinkKafkaConsumer#KEY_PARTITION_DISCOVERY_INTERVAL_MILLIS} in the properties), topics with * names matching the pattern will also be subscribed to as they are created on the fly. * * @param subscriptionPattern The regular expression for a pattern of topic names to subscribe * to. * @param valueDeserializer The de-/serializer used to convert between Kafka's byte messages and * Flink's objects. * @param props */ public FlinkKafkaConsumer( Pattern subscriptionPattern, DeserializationSchema valueDeserializer, Properties props) { this( null, subscriptionPattern, new KafkaDeserializationSchemaWrapper<>(valueDeserializer), props); } /** * Creates a new Kafka streaming source consumer. Use this constructor to subscribe to multiple * topics based on a regular expression pattern. * *

If partition discovery is enabled (by setting a non-negative value for {@link * FlinkKafkaConsumer#KEY_PARTITION_DISCOVERY_INTERVAL_MILLIS} in the properties), topics with * names matching the pattern will also be subscribed to as they are created on the fly. * *

This constructor allows passing a {@see KafkaDeserializationSchema} for reading key/value * pairs, offsets, and topic names from Kafka. * * @param subscriptionPattern The regular expression for a pattern of topic names to subscribe * to. * @param deserializer The keyed de-/serializer used to convert between Kafka's byte messages * and Flink's objects. * @param props */ public FlinkKafkaConsumer( Pattern subscriptionPattern, KafkaDeserializationSchema deserializer, Properties props) { this(null, subscriptionPattern, deserializer, props); } private FlinkKafkaConsumer( List topics, Pattern subscriptionPattern, KafkaDeserializationSchema deserializer, Properties props) { super( topics, subscriptionPattern, deserializer, getLong( checkNotNull(props, "props"), KEY_PARTITION_DISCOVERY_INTERVAL_MILLIS, PARTITION_DISCOVERY_DISABLED), !getBoolean(props, KEY_DISABLE_METRICS, false)); this.properties = props; setDeserializer(this.properties); // configure the polling timeout try { if (properties.containsKey(KEY_POLL_TIMEOUT)) { this.pollTimeout = Long.parseLong(properties.getProperty(KEY_POLL_TIMEOUT)); } else { this.pollTimeout = DEFAULT_POLL_TIMEOUT; } } catch (Exception e) { throw new IllegalArgumentException( "Cannot parse poll timeout for '" + KEY_POLL_TIMEOUT + '\'', e); } } @Override protected AbstractFetcher createFetcher( SourceContext sourceContext, Map assignedPartitionsWithInitialOffsets, SerializedValue> watermarkStrategy, StreamingRuntimeContext runtimeContext, OffsetCommitMode offsetCommitMode, MetricGroup consumerMetricGroup, boolean useMetrics) throws Exception { // make sure that auto commit is disabled when our offset commit mode is ON_CHECKPOINTS; // this overwrites whatever setting the user configured in the properties adjustAutoCommitConfig(properties, offsetCommitMode); return new KafkaFetcher<>( sourceContext, assignedPartitionsWithInitialOffsets, watermarkStrategy, runtimeContext.getProcessingTimeService(), runtimeContext.getExecutionConfig().getAutoWatermarkInterval(), runtimeContext.getUserCodeClassLoader(), runtimeContext.getTaskNameWithSubtasks(), deserializer, properties, pollTimeout, runtimeContext.getMetricGroup(), consumerMetricGroup, useMetrics); } @Override protected AbstractPartitionDiscoverer createPartitionDiscoverer( KafkaTopicsDescriptor topicsDescriptor, int indexOfThisSubtask, int numParallelSubtasks) { return new KafkaPartitionDiscoverer( topicsDescriptor, indexOfThisSubtask, numParallelSubtasks, properties); } @Override protected Map fetchOffsetsWithTimestamp( Collection partitions, long timestamp) { Map partitionOffsetsRequest = new HashMap<>(partitions.size()); for (KafkaTopicPartition partition : partitions) { partitionOffsetsRequest.put( new TopicPartition(partition.getTopic(), partition.getPartition()), timestamp); } final Map result = new HashMap<>(partitions.size()); // use a short-lived consumer to fetch the offsets; // this is ok because this is a one-time operation that happens only on startup try (KafkaConsumer consumer = new KafkaConsumer(properties)) { for (Map.Entry partitionToOffset : consumer.offsetsForTimes(partitionOffsetsRequest).entrySet()) { result.put( new KafkaTopicPartition( partitionToOffset.getKey().topic(), partitionToOffset.getKey().partition()), (partitionToOffset.getValue() == null) ? 
null : partitionToOffset.getValue().offset());
            }
        }
        return result;
    }

    @Override
    protected boolean getIsAutoCommitEnabled() {
        return getBoolean(properties, ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, true)
                && PropertiesUtil.getLong(properties, ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, 5000) > 0;
    }

    /**
     * Makes sure that the ByteArrayDeserializer is registered in the Kafka properties.
     *
     * @param props The Kafka properties to register the serializer in.
     */
    private static void setDeserializer(Properties props) {
        final String deSerName = ByteArrayDeserializer.class.getName();
        Object keyDeSer = props.get(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG);
        Object valDeSer = props.get(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG);
        if (keyDeSer != null && !keyDeSer.equals(deSerName)) {
            LOG.warn("Ignoring configured key DeSerializer ({})", ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG);
        }
        if (valDeSer != null && !valDeSer.equals(deSerName)) {
            LOG.warn("Ignoring configured value DeSerializer ({})", ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG);
        }
        props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, deSerName);
        props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, deSerName);
    }

    // TODO: ------------ start: custom (secondary development) code ----------------- //
    @Override
    public void open(Configuration configuration) throws Exception {
        try {
            String topics = this.properties.getProperty("kafka.topics", "");
            String groupId = this.properties.getProperty("group.id", "");
            // Whether to commit offsets periodically; only takes effect when checkpointing is enabled
            this.enableForceAutoCommit = Boolean.parseBoolean(this.properties.getProperty("kafka.force.autoCommit.enable", "false"));
            // Interval of the forced automatic offset commits
            this.forceAutoCommitIntervalMillis = Long.parseLong(this.properties.getProperty("kafka.force.autoCommit.Interval", "30000"));
            if (this.enableForceAutoCommit) {
                this.executeAutoCommit(topics, groupId);
                LOG.info("Enabled asynchronous kafka offset committing. topics:{} groupId:{} interval:{}", topics, groupId, this.forceAutoCommitIntervalMillis);
            }
            // Decide whether to skip the offsets that were restored from state
            boolean skipRestoredState = Boolean.parseBoolean(this.properties.getProperty("kafka.force.overwrite.stateOffset.enable", "false"));
            if (skipRestoredState && this.restoredState != null) {
                // Once the restored offsets are set to null, super.open will no longer seek to the offsets taken from state
                this.restoredState = null;
                LOG.info("Ignoring the offsets restored from state; consuming kafka messages with the default startup strategy. topics:{} groupId:{}", topics, groupId);
            }
        } catch (Exception e) {
            LOG.error("Failed to enable forced overwrite of restored offsets or periodic offset committing", e);
        } finally {
            super.open(configuration);
        }
    }
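    // A minimal sketch of the job-side properties that drive the custom logic in open() above
    // (all values are illustrative; the keys are the ones read by this method). Note that the
    // periodic commit thread below only commits while the offset commit mode is ON_CHECKPOINTS,
    // i.e. when checkpointing is enabled and commit-on-checkpoints is left switched on:
    //
    //   Properties props = new Properties();
    //   props.setProperty("kafka.force.autoCommit.enable", "true");             // start the commit thread in open()
    //   props.setProperty("kafka.force.autoCommit.Interval", "60000");          // commit roughly every 60s
    //   props.setProperty("kafka.force.overwrite.stateOffset.enable", "false"); // keep offsets restored from state
    //   props.setProperty("kafka.topics", "demo-topic");                        // used for logging only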
    /**
     * Periodically commits kafka offsets.
     *
     * @param topics the kafka topic list
     * @param groupId the groupId for which the asynchronous offset commit is performed
     */
    private void executeAutoCommit(String topics, String groupId) {
        ExecutorService service = Executors.newSingleThreadExecutor();
        service.execute(new Runnable() {
            @Override
            public void run() {
                try {
                    long threadId = Thread.currentThread().getId();
                    while (true) {
                        if (running && kafkaFetcher != null && (offsetCommitMode == OffsetCommitMode.ON_CHECKPOINTS)) {
                            HashMap<KafkaTopicPartition, Long> currentOffsets = kafkaFetcher.snapshotCurrentState();
                            kafkaFetcher.commitInternalOffsetsToKafka(currentOffsets, offsetCommitCallback);
                            LOG.warn("Periodic automatic kafka offset commit succeeded. topics:{} groupId:{} ThreadId:{}", topics, groupId, threadId);
                        }
                        Thread.sleep(forceAutoCommitIntervalMillis);
                    }
                } catch (Exception e) {
                    LOG.error("The periodic automatic offset commit task failed", e);
                }
            }
        });
    }
    // TODO: ------------ end: custom (secondary development) code ----------------- //
}

================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/streaming/connectors/kafka/FlinkKafkaConsumerBase.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.flink.streaming.connectors.kafka; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.ExecutionConfig; import org.apache.flink.api.common.eventtime.WatermarkStrategy; import org.apache.flink.api.common.serialization.RuntimeContextInitializationContextAdapters; import org.apache.flink.api.common.state.CheckpointListener; import org.apache.flink.api.common.state.ListState; import org.apache.flink.api.common.state.ListStateDescriptor; import org.apache.flink.api.common.state.OperatorStateStore; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.common.typeutils.TypeSerializer; import org.apache.flink.api.common.typeutils.base.LongSerializer; import org.apache.flink.api.java.ClosureCleaner; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.api.java.typeutils.ResultTypeQueryable; import org.apache.flink.api.java.typeutils.runtime.TupleSerializer; import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer; import org.apache.flink.configuration.Configuration; import org.apache.flink.metrics.Counter; import org.apache.flink.metrics.MetricGroup; import org.apache.flink.runtime.state.FunctionInitializationContext; import org.apache.flink.runtime.state.FunctionSnapshotContext; import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks; import org.apache.flink.streaming.api.functions.AssignerWithPunctuatedWatermarks; import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; import org.apache.flink.streaming.api.operators.StreamingRuntimeContext; import org.apache.flink.streaming.connectors.kafka.config.OffsetCommitMode; import org.apache.flink.streaming.connectors.kafka.config.OffsetCommitModes; import org.apache.flink.streaming.connectors.kafka.config.StartupMode; import org.apache.flink.streaming.connectors.kafka.internals.AbstractFetcher; import org.apache.flink.streaming.connectors.kafka.internals.AbstractPartitionDiscoverer; import org.apache.flink.streaming.connectors.kafka.internals.KafkaCommitCallback; import org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicPartition; import org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicPartitionAssigner; import org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicPartitionStateSentinel; import org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicsDescriptor; import org.apache.flink.streaming.runtime.operators.util.AssignerWithPeriodicWatermarksAdapter; import org.apache.flink.streaming.runtime.operators.util.AssignerWithPunctuatedWatermarksAdapter; import org.apache.flink.util.ExceptionUtils; import org.apache.flink.util.SerializedValue; import org.apache.commons.collections.map.LinkedMap; import org.apache.kafka.clients.consumer.ConsumerConfig; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.TreeMap; import java.util.concurrent.atomic.AtomicReference; import java.util.regex.Pattern; import static org.apache.flink.streaming.connectors.kafka.internals.metrics.KafkaConsumerMetricConstants.COMMITS_FAILED_METRICS_COUNTER; import static 
org.apache.flink.streaming.connectors.kafka.internals.metrics.KafkaConsumerMetricConstants.COMMITS_SUCCEEDED_METRICS_COUNTER; import static org.apache.flink.streaming.connectors.kafka.internals.metrics.KafkaConsumerMetricConstants.KAFKA_CONSUMER_METRICS_GROUP; import static org.apache.flink.util.Preconditions.checkArgument; import static org.apache.flink.util.Preconditions.checkNotNull; /** * Base class of all Flink Kafka Consumer data sources. This implements the common behavior across * all Kafka versions. * *

The Kafka version specific behavior is defined mainly in the specific subclasses of the {@link * AbstractFetcher}. * * @param The type of records produced by this data source */ @Internal public abstract class FlinkKafkaConsumerBase extends RichParallelSourceFunction implements CheckpointListener, ResultTypeQueryable, CheckpointedFunction { private static final long serialVersionUID = -6272159445203409112L; protected static final Logger LOG = LoggerFactory.getLogger(FlinkKafkaConsumerBase.class); /** The maximum number of pending non-committed checkpoints to track, to avoid memory leaks. */ public static final int MAX_NUM_PENDING_CHECKPOINTS = 100; /** * The default interval to execute partition discovery, in milliseconds ({@code Long.MIN_VALUE}, * i.e. disabled by default). */ public static final long PARTITION_DISCOVERY_DISABLED = Long.MIN_VALUE; /** Boolean configuration key to disable metrics tracking. * */ public static final String KEY_DISABLE_METRICS = "flink.disable-metrics"; /** Configuration key to define the consumer's partition discovery interval, in milliseconds. */ public static final String KEY_PARTITION_DISCOVERY_INTERVAL_MILLIS = "flink.partition-discovery.interval-millis"; /** State name of the consumer's partition offset states. */ private static final String OFFSETS_STATE_NAME = "topic-partition-offset-states"; // ------------------------------------------------------------------------ // configuration state, set on the client relevant for all subtasks // ------------------------------------------------------------------------ /** Describes whether we are discovering partitions for fixed topics or a topic pattern. */ private final KafkaTopicsDescriptor topicsDescriptor; /** The schema to convert between Kafka's byte messages, and Flink's objects. */ protected final KafkaDeserializationSchema deserializer; /** * The set of topic partitions that the source will read, with their initial offsets to start * reading from. */ private Map subscribedPartitionsToStartOffsets; /** * Optional watermark strategy that will be run per Kafka partition, to exploit per-partition * timestamp characteristics. The watermark strategy is kept in serialized form, to deserialize * it into multiple copies. */ private SerializedValue> watermarkStrategy; /** * User-set flag determining whether or not to commit on checkpoints. Note: this flag does not * represent the final offset commit mode. */ private boolean enableCommitOnCheckpoints = true; /** User-set flag to disable filtering restored partitions with current topics descriptor. */ private boolean filterRestoredPartitionsWithCurrentTopicsDescriptor = true; /** User configured value for discovery interval, in milliseconds. */ private final long discoveryIntervalMillis; /** The startup mode for the consumer (default is {@link StartupMode#GROUP_OFFSETS}). */ private StartupMode startupMode = StartupMode.GROUP_OFFSETS; /** * Specific startup offsets; only relevant when startup mode is {@link * StartupMode#SPECIFIC_OFFSETS}. */ private Map specificStartupOffsets; /** * Timestamp to determine startup offsets; only relevant when startup mode is {@link * StartupMode#TIMESTAMP}. */ private Long startupOffsetsTimestamp; // ------------------------------------------------------------------------ // runtime state (used individually by each parallel subtask) // ------------------------------------------------------------------------ /** Data for pending but uncommitted offsets. 
*/ private final LinkedMap pendingOffsetsToCommit = new LinkedMap(); /** The partition discoverer, used to find new partitions. */ private transient volatile AbstractPartitionDiscoverer partitionDiscoverer; // TODO: ------------ start:二次开发代码 ----------------- // /** The fetcher implements the connections to the Kafka brokers. */ protected transient volatile AbstractFetcher kafkaFetcher; /** * The offset commit mode for the consumer. The value of this can only be determined in {@link * FlinkKafkaConsumerBase#open(Configuration)} since it depends on whether or not checkpointing * is enabled for the job. */ protected OffsetCommitMode offsetCommitMode; /** * The offsets to restore to, if the consumer restores state from a checkpoint. * *

This map will be populated by the {@link #initializeState(FunctionInitializationContext)} * method. * *

Using a sorted map as the ordering is important when using restored state to seed the * partition discoverer. */ protected transient volatile TreeMap restoredState; // 是否启用强制的周期性手动提交(区别于自动提交) protected boolean enableForceAutoCommit = false; // 主动周期性提交offset的时间周期 protected long forceAutoCommitIntervalMillis; /** Flag indicating whether the consumer is still running. */ protected volatile boolean running = true; /** * Callback interface that will be invoked upon async Kafka commit completion. Please be aware * that default callback implementation in base class does not provide any guarantees on * thread-safety. This is sufficient for now because current supported Kafka connectors * guarantee no more than 1 concurrent async pending offset commit. */ protected transient KafkaCommitCallback offsetCommitCallback; // TODO: ------------ end:二次开发代码 ----------------- // /** Accessor for state in the operator state backend. */ private transient ListState> unionOffsetStates; /** Discovery loop, executed in a separate thread. */ private transient volatile Thread discoveryLoopThread; // ------------------------------------------------------------------------ // internal metrics // ------------------------------------------------------------------------ /** * Flag indicating whether or not metrics should be exposed. If {@code true}, offset metrics * (e.g. current offset, committed offset) and Kafka-shipped metrics will be registered. */ private final boolean useMetrics; /** Counter for successful Kafka offset commits. */ private transient Counter successfulCommits; /** Counter for failed Kafka offset commits. */ private transient Counter failedCommits; // ------------------------------------------------------------------------ /** * Base constructor. * * @param topics fixed list of topics to subscribe to (null, if using topic pattern) * @param topicPattern the topic pattern to subscribe to (null, if using fixed topics) * @param deserializer The deserializer to turn raw byte messages into Java/Scala objects. * @param discoveryIntervalMillis the topic / partition discovery interval, in milliseconds (0 * if discovery is disabled). */ public FlinkKafkaConsumerBase( List topics, Pattern topicPattern, KafkaDeserializationSchema deserializer, long discoveryIntervalMillis, boolean useMetrics) { this.topicsDescriptor = new KafkaTopicsDescriptor(topics, topicPattern); this.deserializer = checkNotNull(deserializer, "valueDeserializer"); checkArgument( discoveryIntervalMillis == PARTITION_DISCOVERY_DISABLED || discoveryIntervalMillis >= 0, "Cannot define a negative value for the topic / partition discovery interval."); this.discoveryIntervalMillis = discoveryIntervalMillis; this.useMetrics = useMetrics; } /** * Make sure that auto commit is disabled when our offset commit mode is ON_CHECKPOINTS. This * overwrites whatever setting the user configured in the properties. * * @param properties - Kafka configuration properties to be adjusted * @param offsetCommitMode offset commit mode */ protected static void adjustAutoCommitConfig( Properties properties, OffsetCommitMode offsetCommitMode) { if (offsetCommitMode == OffsetCommitMode.ON_CHECKPOINTS || offsetCommitMode == OffsetCommitMode.DISABLED) { properties.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); } } // ------------------------------------------------------------------------ // Configuration // ------------------------------------------------------------------------ /** * Sets the given {@link WatermarkStrategy} on this consumer. 
These will be used to assign * timestamps to records and generates watermarks to signal event time progress. * *

Running timestamp extractors / watermark generators directly inside the Kafka source * (which you can do by using this method), per Kafka partition, allows users to let them * exploit the per-partition characteristics. * *

When a subtask of a FlinkKafkaConsumer source reads multiple Kafka partitions, the streams * from the partitions are unioned in a "first come first serve" fashion. Per-partition * characteristics are usually lost that way. For example, if the timestamps are strictly * ascending per Kafka partition, they will not be strictly ascending in the resulting Flink * DataStream, if the parallel source subtask reads more than one partition. * *
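 * <p>A sketch of such a per-partition strategy ({@code MyEvent} and its timestamp accessor are
 * placeholders, and {@code consumer} is this consumer instance):
 *
 * <pre>{@code
 * consumer.assignTimestampsAndWatermarks(
 *         WatermarkStrategy.<MyEvent>forBoundedOutOfOrderness(Duration.ofSeconds(5))
 *                 .withTimestampAssigner((event, recordTimestamp) -> event.getTimestampMillis()));
 * }</pre>
 *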

Common watermark generation patterns can be found as static methods in the {@link * org.apache.flink.api.common.eventtime.WatermarkStrategy} class. * * @return The consumer object, to allow function chaining. */ public FlinkKafkaConsumerBase assignTimestampsAndWatermarks( WatermarkStrategy watermarkStrategy) { checkNotNull(watermarkStrategy); try { ClosureCleaner.clean( watermarkStrategy, ExecutionConfig.ClosureCleanerLevel.RECURSIVE, true); this.watermarkStrategy = new SerializedValue<>(watermarkStrategy); } catch (Exception e) { throw new IllegalArgumentException( "The given WatermarkStrategy is not serializable", e); } return this; } /** * Specifies an {@link AssignerWithPunctuatedWatermarks} to emit watermarks in a punctuated * manner. The watermark extractor will run per Kafka partition, watermarks will be merged * across partitions in the same way as in the Flink runtime, when streams are merged. * *

When a subtask of a FlinkKafkaConsumer source reads multiple Kafka partitions, the streams * from the partitions are unioned in a "first come first serve" fashion. Per-partition * characteristics are usually lost that way. For example, if the timestamps are strictly * ascending per Kafka partition, they will not be strictly ascending in the resulting Flink * DataStream, if the parallel source subtask reads more than one partition. * *

Running timestamp extractors / watermark generators directly inside the Kafka source, per * Kafka partition, allows users to let them exploit the per-partition characteristics. * *

Note: One can use either an {@link AssignerWithPunctuatedWatermarks} or an {@link * AssignerWithPeriodicWatermarks}, not both at the same time. * *

This method uses the deprecated watermark generator interfaces. Please switch to {@link * #assignTimestampsAndWatermarks(WatermarkStrategy)} to use the new interfaces instead. The new * interfaces support watermark idleness and no longer need to differentiate between "periodic" * and "punctuated" watermarks. * * @deprecated Please use {@link #assignTimestampsAndWatermarks(WatermarkStrategy)} instead. * @param assigner The timestamp assigner / watermark generator to use. * @return The consumer object, to allow function chaining. */ @Deprecated public FlinkKafkaConsumerBase assignTimestampsAndWatermarks( AssignerWithPunctuatedWatermarks assigner) { checkNotNull(assigner); if (this.watermarkStrategy != null) { throw new IllegalStateException("Some watermark strategy has already been set."); } try { ClosureCleaner.clean(assigner, ExecutionConfig.ClosureCleanerLevel.RECURSIVE, true); final WatermarkStrategy wms = new AssignerWithPunctuatedWatermarksAdapter.Strategy<>(assigner); return assignTimestampsAndWatermarks(wms); } catch (Exception e) { throw new IllegalArgumentException("The given assigner is not serializable", e); } } /** * Specifies an {@link AssignerWithPeriodicWatermarks} to emit watermarks in a periodic * manner. The watermark extractor will run per Kafka partition, watermarks will be merged * across partitions in the same way as in the Flink runtime, when streams are merged. * *

When a subtask of a FlinkKafkaConsumer source reads multiple Kafka partitions, the streams * from the partitions are unioned in a "first come first serve" fashion. Per-partition * characteristics are usually lost that way. For example, if the timestamps are strictly * ascending per Kafka partition, they will not be strictly ascending in the resulting Flink * DataStream, if the parallel source subtask reads more than one partition. * *

Running timestamp extractors / watermark generators directly inside the Kafka source, per * Kafka partition, allows users to let them exploit the per-partition characteristics. * *

Note: One can use either an {@link AssignerWithPunctuatedWatermarks} or an {@link * AssignerWithPeriodicWatermarks}, not both at the same time. * *

This method uses the deprecated watermark generator interfaces. Please switch to {@link * #assignTimestampsAndWatermarks(WatermarkStrategy)} to use the new interfaces instead. The new * interfaces support watermark idleness and no longer need to differentiate between "periodic" * and "punctuated" watermarks. * * @deprecated Please use {@link #assignTimestampsAndWatermarks(WatermarkStrategy)} instead. * @param assigner The timestamp assigner / watermark generator to use. * @return The consumer object, to allow function chaining. */ @Deprecated public FlinkKafkaConsumerBase assignTimestampsAndWatermarks( AssignerWithPeriodicWatermarks assigner) { checkNotNull(assigner); if (this.watermarkStrategy != null) { throw new IllegalStateException("Some watermark strategy has already been set."); } try { ClosureCleaner.clean(assigner, ExecutionConfig.ClosureCleanerLevel.RECURSIVE, true); final WatermarkStrategy wms = new AssignerWithPeriodicWatermarksAdapter.Strategy<>(assigner); return assignTimestampsAndWatermarks(wms); } catch (Exception e) { throw new IllegalArgumentException("The given assigner is not serializable", e); } } /** * Specifies whether or not the consumer should commit offsets back to Kafka on checkpoints. * *
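* Illustrative call (hypothetical {@code consumer} instance); the flag only takes effect when
* checkpointing is enabled for the job, as described below:
*
*   consumer.setCommitOffsetsOnCheckpoints(true);
*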

This setting will only have effect if checkpointing is enabled for the job. If * checkpointing isn't enabled, only the "auto.commit.enable" (for 0.8) / "enable.auto.commit" * (for 0.9+) property settings will be used. * * @return The consumer object, to allow function chaining. */ public FlinkKafkaConsumerBase setCommitOffsetsOnCheckpoints(boolean commitOnCheckpoints) { this.enableCommitOnCheckpoints = commitOnCheckpoints; return this; } /** * Specifies the consumer to start reading from the earliest offset for all partitions. This * lets the consumer ignore any committed group offsets in Zookeeper / Kafka brokers. * *
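* Illustrative call (hypothetical {@code consumer} instance):
*
*   consumer.setStartFromEarliest();   // ignore committed group offsets and read from the beginning
*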

This method does not affect where partitions are read from when the consumer is restored * from a checkpoint or savepoint. When the consumer is restored from a checkpoint or savepoint, * only the offsets in the restored state will be used. * * @return The consumer object, to allow function chaining. */ public FlinkKafkaConsumerBase setStartFromEarliest() { this.startupMode = StartupMode.EARLIEST; this.startupOffsetsTimestamp = null; this.specificStartupOffsets = null; return this; } /** * Specifies the consumer to start reading from the latest offset for all partitions. This lets * the consumer ignore any committed group offsets in Zookeeper / Kafka brokers. * *

This method does not affect where partitions are read from when the consumer is restored * from a checkpoint or savepoint. When the consumer is restored from a checkpoint or savepoint, * only the offsets in the restored state will be used. * * @return The consumer object, to allow function chaining. */ public FlinkKafkaConsumerBase setStartFromLatest() { this.startupMode = StartupMode.LATEST; this.startupOffsetsTimestamp = null; this.specificStartupOffsets = null; return this; } /** * Specifies the consumer to start reading partitions from a specified timestamp. The specified * timestamp must be before the current timestamp. This lets the consumer ignore any committed * group offsets in Zookeeper / Kafka brokers. * *

The consumer will look up the earliest offset whose timestamp is greater than or equal to * the specific timestamp from Kafka. If there's no such offset, the consumer will use the * latest offset to read data from Kafka. * *
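* Illustrative call (hypothetical {@code consumer} instance), starting roughly one hour in the past:
*
*   consumer.setStartFromTimestamp(System.currentTimeMillis() - 60 * 60 * 1000L);
*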

This method does not affect where partitions are read from when the consumer is restored * from a checkpoint or savepoint. When the consumer is restored from a checkpoint or savepoint, * only the offsets in the restored state will be used. * * @param startupOffsetsTimestamp timestamp for the startup offsets, as milliseconds from epoch. * @return The consumer object, to allow function chaining. */ public FlinkKafkaConsumerBase setStartFromTimestamp(long startupOffsetsTimestamp) { checkArgument( startupOffsetsTimestamp >= 0, "The provided value for the startup offsets timestamp is invalid."); long currentTimestamp = System.currentTimeMillis(); checkArgument( startupOffsetsTimestamp <= currentTimestamp, "Startup time[%s] must be before current time[%s].", startupOffsetsTimestamp, currentTimestamp); this.startupMode = StartupMode.TIMESTAMP; this.startupOffsetsTimestamp = startupOffsetsTimestamp; this.specificStartupOffsets = null; return this; } /** * Specifies the consumer to start reading from any committed group offsets found in Zookeeper / * Kafka brokers. The "group.id" property must be set in the configuration properties. If no * offset can be found for a partition, the behaviour in "auto.offset.reset" set in the * configuration properties will be used for the partition. * *
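* Illustrative setup; the topic name, group id and the {@code props} / {@code consumer} variable
* names are hypothetical, while the property keys are standard Kafka consumer settings:
*
*   Properties props = new Properties();
*   props.setProperty("bootstrap.servers", "localhost:9092");
*   props.setProperty("group.id", "my-consumer-group");    // required for GROUP_OFFSETS
*   props.setProperty("auto.offset.reset", "earliest");    // fallback when no committed offset exists
*   FlinkKafkaConsumer<String> consumer =
*           new FlinkKafkaConsumer<>("orders", new SimpleStringSchema(), props);
*   consumer.setStartFromGroupOffsets();                    // also the default startup mode
*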

This method does not affect where partitions are read from when the consumer is restored * from a checkpoint or savepoint. When the consumer is restored from a checkpoint or savepoint, * only the offsets in the restored state will be used. * * @return The consumer object, to allow function chaining. */ public FlinkKafkaConsumerBase setStartFromGroupOffsets() { this.startupMode = StartupMode.GROUP_OFFSETS; this.startupOffsetsTimestamp = null; this.specificStartupOffsets = null; return this; } /** * Specifies the consumer to start reading partitions from specific offsets, set independently * for each partition. The specified offset should be the offset of the next record that will be * read from partitions. This lets the consumer ignore any committed group offsets in Zookeeper * / Kafka brokers. * *

If the provided map of offsets contains entries whose {@link KafkaTopicPartition} is not * subscribed by the consumer, the entry will be ignored. If the consumer subscribes to a * partition that does not exist in the provided map of offsets, the consumer will fallback to * the default group offset behaviour (see {@link * FlinkKafkaConsumerBase#setStartFromGroupOffsets()}) for that particular partition. * *

If the specified offset for a partition is invalid, or the behaviour for that partition is * defaulted to group offsets but still no group offset could be found for it, then the * "auto.offset.reset" behaviour set in the configuration properties will be used for the * partition. * *
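* A sketch of the expected map; the topic name, partition numbers and offset values are
* hypothetical, and each offset denotes the next record to be read from that partition:
*
*   Map<KafkaTopicPartition, Long> specificOffsets = new HashMap<>();
*   specificOffsets.put(new KafkaTopicPartition("orders", 0), 23L);
*   specificOffsets.put(new KafkaTopicPartition("orders", 1), 31L);
*   consumer.setStartFromSpecificOffsets(specificOffsets);
*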

This method does not affect where partitions are read from when the consumer is restored * from a checkpoint or savepoint. When the consumer is restored from a checkpoint or savepoint, * only the offsets in the restored state will be used. * * @return The consumer object, to allow function chaining. */ public FlinkKafkaConsumerBase setStartFromSpecificOffsets( Map specificStartupOffsets) { this.startupMode = StartupMode.SPECIFIC_OFFSETS; this.startupOffsetsTimestamp = null; this.specificStartupOffsets = checkNotNull(specificStartupOffsets); return this; } /** * By default, when restoring from a checkpoint / savepoint, the consumer always ignores * restored partitions that are no longer associated with the current specified topics or topic * pattern to subscribe to. * *
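* Illustrative call (hypothetical {@code consumer} instance) that keeps every restored partition,
* as described in the next paragraph:
*
*   consumer.disableFilterRestoredPartitionsWithSubscribedTopics();
*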

This method configures the consumer to not filter the restored partitions, therefore * always attempting to consume whatever partition was present in the previous execution * regardless of the specified topics to subscribe to in the current execution. * * @return The consumer object, to allow function chaining. */ public FlinkKafkaConsumerBase disableFilterRestoredPartitionsWithSubscribedTopics() { this.filterRestoredPartitionsWithCurrentTopicsDescriptor = false; return this; } // ------------------------------------------------------------------------ // Work methods // ------------------------------------------------------------------------ @Override public void open(Configuration configuration) throws Exception { // determine the offset commit mode this.offsetCommitMode = OffsetCommitModes.fromConfiguration( getIsAutoCommitEnabled(), enableCommitOnCheckpoints, ((StreamingRuntimeContext) getRuntimeContext()).isCheckpointingEnabled()); // create the partition discoverer this.partitionDiscoverer = createPartitionDiscoverer( topicsDescriptor, getRuntimeContext().getIndexOfThisSubtask(), getRuntimeContext().getNumberOfParallelSubtasks()); this.partitionDiscoverer.open(); subscribedPartitionsToStartOffsets = new HashMap<>(); final List allPartitions = partitionDiscoverer.discoverPartitions(); if (restoredState != null) { for (KafkaTopicPartition partition : allPartitions) { if (!restoredState.containsKey(partition)) { restoredState.put(partition, KafkaTopicPartitionStateSentinel.EARLIEST_OFFSET); } } for (Map.Entry restoredStateEntry : restoredState.entrySet()) { // seed the partition discoverer with the union state while filtering out // restored partitions that should not be subscribed by this subtask if (KafkaTopicPartitionAssigner.assign( restoredStateEntry.getKey(), getRuntimeContext().getNumberOfParallelSubtasks()) == getRuntimeContext().getIndexOfThisSubtask()) { subscribedPartitionsToStartOffsets.put( restoredStateEntry.getKey(), restoredStateEntry.getValue()); } } if (filterRestoredPartitionsWithCurrentTopicsDescriptor) { subscribedPartitionsToStartOffsets .entrySet() .removeIf( entry -> { if (!topicsDescriptor.isMatchingTopic( entry.getKey().getTopic())) { LOG.warn( "{} is removed from subscribed partitions since it is no longer associated with topics descriptor of current execution.", entry.getKey()); return true; } return false; }); } LOG.info( "Consumer subtask {} will start reading {} partitions with offsets in restored state: {}", getRuntimeContext().getIndexOfThisSubtask(), subscribedPartitionsToStartOffsets.size(), subscribedPartitionsToStartOffsets); } else { // use the partition discoverer to fetch the initial seed partitions, // and set their initial offsets depending on the startup mode. // for SPECIFIC_OFFSETS and TIMESTAMP modes, we set the specific offsets now; // for other modes (EARLIEST, LATEST, and GROUP_OFFSETS), the offset is lazily // determined // when the partition is actually read. 
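// note (summary of the branches below): sentinel values stand in for EARLIEST / LATEST /
// GROUP_OFFSETS, while SPECIFIC_OFFSETS and TIMESTAMP are resolved to concrete offsets here,
// stored as "next offset to read" minus one so the state reflects the last consumed record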
switch (startupMode) { case SPECIFIC_OFFSETS: if (specificStartupOffsets == null) { throw new IllegalStateException( "Startup mode for the consumer set to " + StartupMode.SPECIFIC_OFFSETS + ", but no specific offsets were specified."); } for (KafkaTopicPartition seedPartition : allPartitions) { Long specificOffset = specificStartupOffsets.get(seedPartition); if (specificOffset != null) { // since the specified offsets represent the next record to read, we // subtract // it by one so that the initial state of the consumer will be correct subscribedPartitionsToStartOffsets.put( seedPartition, specificOffset - 1); } else { // default to group offset behaviour if the user-provided specific // offsets // do not contain a value for this partition subscribedPartitionsToStartOffsets.put( seedPartition, KafkaTopicPartitionStateSentinel.GROUP_OFFSET); } } break; case TIMESTAMP: if (startupOffsetsTimestamp == null) { throw new IllegalStateException( "Startup mode for the consumer set to " + StartupMode.TIMESTAMP + ", but no startup timestamp was specified."); } for (Map.Entry partitionToOffset : fetchOffsetsWithTimestamp(allPartitions, startupOffsetsTimestamp) .entrySet()) { subscribedPartitionsToStartOffsets.put( partitionToOffset.getKey(), (partitionToOffset.getValue() == null) // if an offset cannot be retrieved for a partition with the // given timestamp, // we default to using the latest offset for the partition ? KafkaTopicPartitionStateSentinel.LATEST_OFFSET // since the specified offsets represent the next record to // read, we subtract // it by one so that the initial state of the consumer will // be correct : partitionToOffset.getValue() - 1); } break; default: for (KafkaTopicPartition seedPartition : allPartitions) { subscribedPartitionsToStartOffsets.put( seedPartition, startupMode.getStateSentinel()); } } if (!subscribedPartitionsToStartOffsets.isEmpty()) { switch (startupMode) { case EARLIEST: LOG.info( "Consumer subtask {} will start reading the following {} partitions from the earliest offsets: {}", getRuntimeContext().getIndexOfThisSubtask(), subscribedPartitionsToStartOffsets.size(), subscribedPartitionsToStartOffsets.keySet()); break; case LATEST: LOG.info( "Consumer subtask {} will start reading the following {} partitions from the latest offsets: {}", getRuntimeContext().getIndexOfThisSubtask(), subscribedPartitionsToStartOffsets.size(), subscribedPartitionsToStartOffsets.keySet()); break; case TIMESTAMP: LOG.info( "Consumer subtask {} will start reading the following {} partitions from timestamp {}: {}", getRuntimeContext().getIndexOfThisSubtask(), subscribedPartitionsToStartOffsets.size(), startupOffsetsTimestamp, subscribedPartitionsToStartOffsets.keySet()); break; case SPECIFIC_OFFSETS: LOG.info( "Consumer subtask {} will start reading the following {} partitions from the specified startup offsets {}: {}", getRuntimeContext().getIndexOfThisSubtask(), subscribedPartitionsToStartOffsets.size(), specificStartupOffsets, subscribedPartitionsToStartOffsets.keySet()); List partitionsDefaultedToGroupOffsets = new ArrayList<>(subscribedPartitionsToStartOffsets.size()); for (Map.Entry subscribedPartition : subscribedPartitionsToStartOffsets.entrySet()) { if (subscribedPartition.getValue() == KafkaTopicPartitionStateSentinel.GROUP_OFFSET) { partitionsDefaultedToGroupOffsets.add(subscribedPartition.getKey()); } } if (partitionsDefaultedToGroupOffsets.size() > 0) { LOG.warn( "Consumer subtask {} cannot find offsets for the following {} partitions in the specified startup offsets: {}" + "; 
their startup offsets will be defaulted to their committed group offsets in Kafka.", getRuntimeContext().getIndexOfThisSubtask(), partitionsDefaultedToGroupOffsets.size(), partitionsDefaultedToGroupOffsets); } break; case GROUP_OFFSETS: LOG.info( "Consumer subtask {} will start reading the following {} partitions from the committed group offsets in Kafka: {}", getRuntimeContext().getIndexOfThisSubtask(), subscribedPartitionsToStartOffsets.size(), subscribedPartitionsToStartOffsets.keySet()); } } else { LOG.info( "Consumer subtask {} initially has no partitions to read from.", getRuntimeContext().getIndexOfThisSubtask()); } } this.deserializer.open( RuntimeContextInitializationContextAdapters.deserializationAdapter( getRuntimeContext(), metricGroup -> metricGroup.addGroup("user"))); } @Override public void run(SourceContext sourceContext) throws Exception { if (subscribedPartitionsToStartOffsets == null) { throw new Exception("The partitions were not set for the consumer"); } // initialize commit metrics and default offset callback method this.successfulCommits = this.getRuntimeContext() .getMetricGroup() .counter(COMMITS_SUCCEEDED_METRICS_COUNTER); this.failedCommits = this.getRuntimeContext().getMetricGroup().counter(COMMITS_FAILED_METRICS_COUNTER); final int subtaskIndex = this.getRuntimeContext().getIndexOfThisSubtask(); this.offsetCommitCallback = new KafkaCommitCallback() { @Override public void onSuccess() { successfulCommits.inc(); } @Override public void onException(Throwable cause) { LOG.warn( String.format( "Consumer subtask %d failed async Kafka commit.", subtaskIndex), cause); failedCommits.inc(); } }; // mark the subtask as temporarily idle if there are no initial seed partitions; // once this subtask discovers some partitions and starts collecting records, the subtask's // status will automatically be triggered back to be active. if (subscribedPartitionsToStartOffsets.isEmpty()) { sourceContext.markAsTemporarilyIdle(); } LOG.info( "Consumer subtask {} creating fetcher with offsets {}.", getRuntimeContext().getIndexOfThisSubtask(), subscribedPartitionsToStartOffsets); // from this point forward: // - 'snapshotState' will draw offsets from the fetcher, // instead of being built from `subscribedPartitionsToStartOffsets` // - 'notifyCheckpointComplete' will start to do work (i.e. 
commit offsets to // Kafka through the fetcher, if configured to do so) this.kafkaFetcher = createFetcher( sourceContext, subscribedPartitionsToStartOffsets, watermarkStrategy, (StreamingRuntimeContext) getRuntimeContext(), offsetCommitMode, getRuntimeContext().getMetricGroup().addGroup(KAFKA_CONSUMER_METRICS_GROUP), useMetrics); if (!running) { return; } // depending on whether we were restored with the current state version (1.3), // remaining logic branches off into 2 paths: // 1) New state - partition discovery loop executed as separate thread, with this // thread running the main fetcher loop // 2) Old state - partition discovery is disabled and only the main fetcher loop is // executed if (discoveryIntervalMillis == PARTITION_DISCOVERY_DISABLED) { kafkaFetcher.runFetchLoop(); } else { runWithPartitionDiscovery(); } } private void runWithPartitionDiscovery() throws Exception { final AtomicReference discoveryLoopErrorRef = new AtomicReference<>(); createAndStartDiscoveryLoop(discoveryLoopErrorRef); kafkaFetcher.runFetchLoop(); // make sure that the partition discoverer is waked up so that // the discoveryLoopThread exits partitionDiscoverer.wakeup(); joinDiscoveryLoopThread(); // rethrow any fetcher errors final Exception discoveryLoopError = discoveryLoopErrorRef.get(); if (discoveryLoopError != null) { throw new RuntimeException(discoveryLoopError); } } @VisibleForTesting void joinDiscoveryLoopThread() throws InterruptedException { if (discoveryLoopThread != null) { discoveryLoopThread.join(); } } private void createAndStartDiscoveryLoop(AtomicReference discoveryLoopErrorRef) { discoveryLoopThread = new Thread( () -> { try { // --------------------- partition discovery loop // --------------------- // throughout the loop, we always eagerly check if we are still // running before // performing the next operation, so that we can escape the loop as // soon as possible while (running) { if (LOG.isDebugEnabled()) { LOG.debug( "Consumer subtask {} is trying to discover new partitions ...", getRuntimeContext().getIndexOfThisSubtask()); } final List discoveredPartitions; try { discoveredPartitions = partitionDiscoverer.discoverPartitions(); } catch (AbstractPartitionDiscoverer.WakeupException | AbstractPartitionDiscoverer.ClosedException e) { // the partition discoverer may have been closed or woken up // before or during the discovery; // this would only happen if the consumer was canceled; // simply escape the loop break; } // no need to add the discovered partitions if we were closed // during the meantime if (running && !discoveredPartitions.isEmpty()) { kafkaFetcher.addDiscoveredPartitions(discoveredPartitions); } // do not waste any time sleeping if we're not running anymore if (running && discoveryIntervalMillis != 0) { try { Thread.sleep(discoveryIntervalMillis); } catch (InterruptedException iex) { // may be interrupted if the consumer was canceled // midway; simply escape the loop break; } } } } catch (Exception e) { discoveryLoopErrorRef.set(e); } finally { // calling cancel will also let the fetcher loop escape // (if not running, cancel() was already called) if (running) { cancel(); } } }, "Kafka Partition Discovery for " + getRuntimeContext().getTaskNameWithSubtasks()); discoveryLoopThread.start(); } @Override public void cancel() { // set ourselves as not running; // this would let the main discovery loop escape as soon as possible running = false; if (discoveryLoopThread != null) { if (partitionDiscoverer != null) { // we cannot close the discoverer here, as it is error-prone 
to concurrent access; // only wakeup the discoverer, the discovery loop will clean itself up after it // escapes partitionDiscoverer.wakeup(); } // the discovery loop may currently be sleeping in-between // consecutive discoveries; interrupt to shutdown faster discoveryLoopThread.interrupt(); } // abort the fetcher, if there is one if (kafkaFetcher != null) { kafkaFetcher.cancel(); } } @Override public void close() throws Exception { cancel(); joinDiscoveryLoopThread(); Exception exception = null; if (partitionDiscoverer != null) { try { partitionDiscoverer.close(); } catch (Exception e) { exception = e; } } try { super.close(); } catch (Exception e) { exception = ExceptionUtils.firstOrSuppressed(e, exception); } if (exception != null) { throw exception; } } // ------------------------------------------------------------------------ // Checkpoint and restore // ------------------------------------------------------------------------ @Override public final void initializeState(FunctionInitializationContext context) throws Exception { OperatorStateStore stateStore = context.getOperatorStateStore(); this.unionOffsetStates = stateStore.getUnionListState( new ListStateDescriptor<>( OFFSETS_STATE_NAME, createStateSerializer(getRuntimeContext().getExecutionConfig()))); if (context.isRestored()) { restoredState = new TreeMap<>(new KafkaTopicPartition.Comparator()); // populate actual holder for restored state for (Tuple2 kafkaOffset : unionOffsetStates.get()) { restoredState.put(kafkaOffset.f0, kafkaOffset.f1); } LOG.info( "Consumer subtask {} restored state: {}.", getRuntimeContext().getIndexOfThisSubtask(), restoredState); } else { LOG.info( "Consumer subtask {} has no restore state.", getRuntimeContext().getIndexOfThisSubtask()); } } @Override public final void snapshotState(FunctionSnapshotContext context) throws Exception { if (!running) { LOG.debug("snapshotState() called on closed source"); } else { unionOffsetStates.clear(); final AbstractFetcher fetcher = this.kafkaFetcher; if (fetcher == null) { // the fetcher has not yet been initialized, which means we need to return the // originally restored offsets or the assigned partitions for (Map.Entry subscribedPartition : subscribedPartitionsToStartOffsets.entrySet()) { unionOffsetStates.add( Tuple2.of( subscribedPartition.getKey(), subscribedPartition.getValue())); } if (offsetCommitMode == OffsetCommitMode.ON_CHECKPOINTS) { // the map cannot be asynchronously updated, because only one checkpoint call // can happen // on this function at a time: either snapshotState() or // notifyCheckpointComplete() pendingOffsetsToCommit.put(context.getCheckpointId(), restoredState); } } else { HashMap currentOffsets = fetcher.snapshotCurrentState(); if (offsetCommitMode == OffsetCommitMode.ON_CHECKPOINTS) { // the map cannot be asynchronously updated, because only one checkpoint call // can happen // on this function at a time: either snapshotState() or // notifyCheckpointComplete() pendingOffsetsToCommit.put(context.getCheckpointId(), currentOffsets); } for (Map.Entry kafkaTopicPartitionLongEntry : currentOffsets.entrySet()) { unionOffsetStates.add( Tuple2.of( kafkaTopicPartitionLongEntry.getKey(), kafkaTopicPartitionLongEntry.getValue())); } } if (offsetCommitMode == OffsetCommitMode.ON_CHECKPOINTS) { // truncate the map of pending offsets to commit, to prevent infinite growth while (pendingOffsetsToCommit.size() > MAX_NUM_PENDING_CHECKPOINTS) { pendingOffsetsToCommit.remove(0); } } } } @Override public final void notifyCheckpointComplete(long 
checkpointId) throws Exception { if (!running) { LOG.debug("notifyCheckpointComplete() called on closed source"); return; } final AbstractFetcher fetcher = this.kafkaFetcher; if (fetcher == null) { LOG.debug("notifyCheckpointComplete() called on uninitialized source"); return; } if (offsetCommitMode == OffsetCommitMode.ON_CHECKPOINTS) { // only one commit operation must be in progress if (LOG.isDebugEnabled()) { LOG.debug( "Consumer subtask {} committing offsets to Kafka/ZooKeeper for checkpoint {}.", getRuntimeContext().getIndexOfThisSubtask(), checkpointId); } try { final int posInMap = pendingOffsetsToCommit.indexOf(checkpointId); if (posInMap == -1) { LOG.warn( "Consumer subtask {} received confirmation for unknown checkpoint id {}", getRuntimeContext().getIndexOfThisSubtask(), checkpointId); return; } @SuppressWarnings("unchecked") Map offsets = (Map) pendingOffsetsToCommit.remove(posInMap); // remove older checkpoints in map for (int i = 0; i < posInMap; i++) { pendingOffsetsToCommit.remove(0); } if (offsets == null || offsets.size() == 0) { LOG.debug( "Consumer subtask {} has empty checkpoint state.", getRuntimeContext().getIndexOfThisSubtask()); return; } fetcher.commitInternalOffsetsToKafka(offsets, offsetCommitCallback); } catch (Exception e) { if (running) { throw e; } // else ignore exception if we are no longer running } } } @Override public void notifyCheckpointAborted(long checkpointId) {} // ------------------------------------------------------------------------ // Kafka Consumer specific methods // ------------------------------------------------------------------------ /** * Creates the fetcher that connect to the Kafka brokers, pulls data, deserialized the data, and * emits it into the data streams. * * @param sourceContext The source context to emit data to. * @param subscribedPartitionsToStartOffsets The set of partitions that this subtask should * handle, with their start offsets. * @param watermarkStrategy Optional, a serialized WatermarkStrategy. * @param runtimeContext The task's runtime context. * @return The instantiated fetcher * @throws Exception The method should forward exceptions */ protected abstract AbstractFetcher createFetcher( SourceContext sourceContext, Map subscribedPartitionsToStartOffsets, SerializedValue> watermarkStrategy, StreamingRuntimeContext runtimeContext, OffsetCommitMode offsetCommitMode, MetricGroup kafkaMetricGroup, boolean useMetrics) throws Exception; /** * Creates the partition discoverer that is used to find new partitions for this subtask. * * @param topicsDescriptor Descriptor that describes whether we are discovering partitions for * fixed topics or a topic pattern. * @param indexOfThisSubtask The index of this consumer subtask. * @param numParallelSubtasks The total number of parallel consumer subtasks. 
* @return The instantiated partition discoverer */ protected abstract AbstractPartitionDiscoverer createPartitionDiscoverer( KafkaTopicsDescriptor topicsDescriptor, int indexOfThisSubtask, int numParallelSubtasks); protected abstract boolean getIsAutoCommitEnabled(); protected abstract Map fetchOffsetsWithTimestamp( Collection partitions, long timestamp); // ------------------------------------------------------------------------ // ResultTypeQueryable methods // ------------------------------------------------------------------------ @Override public TypeInformation getProducedType() { return deserializer.getProducedType(); } // ------------------------------------------------------------------------ // Test utilities // ------------------------------------------------------------------------ @VisibleForTesting Map getSubscribedPartitionsToStartOffsets() { return subscribedPartitionsToStartOffsets; } @VisibleForTesting TreeMap getRestoredState() { return restoredState; } @VisibleForTesting OffsetCommitMode getOffsetCommitMode() { return offsetCommitMode; } @VisibleForTesting LinkedMap getPendingOffsetsToCommit() { return pendingOffsetsToCommit; } @VisibleForTesting public boolean getEnableCommitOnCheckpoints() { return enableCommitOnCheckpoints; } /** * Creates state serializer for kafka topic partition to offset tuple. Using of the explicit * state serializer with KryoSerializer is needed because otherwise users cannot use * 'disableGenericTypes' properties with KafkaConsumer. */ @VisibleForTesting static TupleSerializer> createStateSerializer( ExecutionConfig executionConfig) { // explicit serializer will keep the compatibility with GenericTypeInformation and allow to // disableGenericTypes for users TypeSerializer[] fieldSerializers = new TypeSerializer[] { new KryoSerializer<>(KafkaTopicPartition.class, executionConfig), LongSerializer.INSTANCE }; @SuppressWarnings("unchecked") Class> tupleClass = (Class>) (Class) Tuple2.class; return new TupleSerializer<>(tupleClass, fieldSerializers); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/table/api/internal/TableEnvironmentImpl.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.table.api.internal; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.JobExecutionResult; import org.apache.flink.api.dag.Pipeline; import org.apache.flink.api.dag.Transformation; import org.apache.flink.configuration.Configuration; import org.apache.flink.core.execution.JobClient; import org.apache.flink.table.api.DataTypes; import org.apache.flink.table.api.EnvironmentSettings; import org.apache.flink.table.api.ExplainDetail; import org.apache.flink.table.api.ResultKind; import org.apache.flink.table.api.SqlParserException; import org.apache.flink.table.api.StatementSet; import org.apache.flink.table.api.Table; import org.apache.flink.table.api.TableConfig; import org.apache.flink.table.api.TableDescriptor; import org.apache.flink.table.api.TableEnvironment; import org.apache.flink.table.api.TableException; import org.apache.flink.table.api.TableResult; import org.apache.flink.table.api.TableSchema; import org.apache.flink.table.api.ValidationException; import org.apache.flink.table.catalog.Catalog; import org.apache.flink.table.catalog.CatalogBaseTable; import org.apache.flink.table.catalog.CatalogFunction; import org.apache.flink.table.catalog.CatalogManager; import org.apache.flink.table.catalog.CatalogPartition; import org.apache.flink.table.catalog.CatalogPartitionSpec; import org.apache.flink.table.catalog.CatalogTable; import org.apache.flink.table.catalog.CatalogTableImpl; import org.apache.flink.table.catalog.Column; import org.apache.flink.table.catalog.ConnectorCatalogTable; import org.apache.flink.table.catalog.FunctionCatalog; import org.apache.flink.table.catalog.GenericInMemoryCatalog; import org.apache.flink.table.catalog.ObjectIdentifier; import org.apache.flink.table.catalog.ObjectPath; import org.apache.flink.table.catalog.QueryOperationCatalogView; import org.apache.flink.table.catalog.ResolvedCatalogBaseTable; import org.apache.flink.table.catalog.ResolvedCatalogTable; import org.apache.flink.table.catalog.ResolvedSchema; import org.apache.flink.table.catalog.UnresolvedIdentifier; import org.apache.flink.table.catalog.WatermarkSpec; import org.apache.flink.table.catalog.exceptions.CatalogException; import org.apache.flink.table.catalog.exceptions.DatabaseAlreadyExistException; import org.apache.flink.table.catalog.exceptions.DatabaseNotEmptyException; import org.apache.flink.table.catalog.exceptions.DatabaseNotExistException; import org.apache.flink.table.catalog.exceptions.FunctionAlreadyExistException; import org.apache.flink.table.catalog.exceptions.FunctionNotExistException; import org.apache.flink.table.catalog.exceptions.TableAlreadyExistException; import org.apache.flink.table.catalog.exceptions.TableNotExistException; import org.apache.flink.table.delegation.Executor; import org.apache.flink.table.delegation.ExecutorFactory; import org.apache.flink.table.delegation.Parser; import org.apache.flink.table.delegation.Planner; import org.apache.flink.table.expressions.ApiExpressionUtils; import org.apache.flink.table.expressions.Expression; import org.apache.flink.table.factories.FactoryUtil; import org.apache.flink.table.factories.PlannerFactoryUtil; import org.apache.flink.table.functions.ScalarFunction; import org.apache.flink.table.functions.UserDefinedFunction; import org.apache.flink.table.functions.UserDefinedFunctionHelper; import org.apache.flink.table.module.Module; import org.apache.flink.table.module.ModuleEntry; import 
org.apache.flink.table.module.ModuleManager; import org.apache.flink.table.operations.CatalogQueryOperation; import org.apache.flink.table.operations.CatalogSinkModifyOperation; import org.apache.flink.table.operations.CollectModifyOperation; import org.apache.flink.table.operations.DescribeTableOperation; import org.apache.flink.table.operations.ExplainOperation; import org.apache.flink.table.operations.LoadModuleOperation; import org.apache.flink.table.operations.ModifyOperation; import org.apache.flink.table.operations.NopOperation; import org.apache.flink.table.operations.Operation; import org.apache.flink.table.operations.QueryOperation; import org.apache.flink.table.operations.ShowCatalogsOperation; import org.apache.flink.table.operations.ShowCreateTableOperation; import org.apache.flink.table.operations.ShowCurrentCatalogOperation; import org.apache.flink.table.operations.ShowCurrentDatabaseOperation; import org.apache.flink.table.operations.ShowDatabasesOperation; import org.apache.flink.table.operations.ShowFunctionsOperation; import org.apache.flink.table.operations.ShowModulesOperation; import org.apache.flink.table.operations.ShowPartitionsOperation; import org.apache.flink.table.operations.ShowTablesOperation; import org.apache.flink.table.operations.ShowViewsOperation; import org.apache.flink.table.operations.TableSourceQueryOperation; import org.apache.flink.table.operations.UnloadModuleOperation; import org.apache.flink.table.operations.UseCatalogOperation; import org.apache.flink.table.operations.UseDatabaseOperation; import org.apache.flink.table.operations.UseModulesOperation; import org.apache.flink.table.operations.ddl.AddPartitionsOperation; import org.apache.flink.table.operations.ddl.AlterCatalogFunctionOperation; import org.apache.flink.table.operations.ddl.AlterDatabaseOperation; import org.apache.flink.table.operations.ddl.AlterPartitionPropertiesOperation; import org.apache.flink.table.operations.ddl.AlterTableAddConstraintOperation; import org.apache.flink.table.operations.ddl.AlterTableDropConstraintOperation; import org.apache.flink.table.operations.ddl.AlterTableOperation; import org.apache.flink.table.operations.ddl.AlterTableOptionsOperation; import org.apache.flink.table.operations.ddl.AlterTableRenameOperation; import org.apache.flink.table.operations.ddl.AlterTableSchemaOperation; import org.apache.flink.table.operations.ddl.AlterViewAsOperation; import org.apache.flink.table.operations.ddl.AlterViewOperation; import org.apache.flink.table.operations.ddl.AlterViewPropertiesOperation; import org.apache.flink.table.operations.ddl.AlterViewRenameOperation; import org.apache.flink.table.operations.ddl.CreateCatalogFunctionOperation; import org.apache.flink.table.operations.ddl.CreateCatalogOperation; import org.apache.flink.table.operations.ddl.CreateDatabaseOperation; import org.apache.flink.table.operations.ddl.CreateTableASOperation; import org.apache.flink.table.operations.ddl.CreateTableOperation; import org.apache.flink.table.operations.ddl.CreateTempSystemFunctionOperation; import org.apache.flink.table.operations.ddl.CreateViewOperation; import org.apache.flink.table.operations.ddl.DropCatalogFunctionOperation; import org.apache.flink.table.operations.ddl.DropCatalogOperation; import org.apache.flink.table.operations.ddl.DropDatabaseOperation; import org.apache.flink.table.operations.ddl.DropPartitionsOperation; import org.apache.flink.table.operations.ddl.DropTableOperation; import 
org.apache.flink.table.operations.ddl.DropTempSystemFunctionOperation; import org.apache.flink.table.operations.ddl.DropViewOperation; import org.apache.flink.table.operations.utils.OperationTreeBuilder; import org.apache.flink.table.sinks.TableSink; import org.apache.flink.table.sources.TableSource; import org.apache.flink.table.sources.TableSourceValidation; import org.apache.flink.table.types.AbstractDataType; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.utils.EncodingUtils; import org.apache.flink.table.utils.PrintUtils; import org.apache.flink.table.utils.TableSchemaUtils; import org.apache.flink.types.Row; import org.apache.flink.util.Preconditions; import org.apache.commons.lang3.StringUtils; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.concurrent.ExecutionException; import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Collectors; import java.util.stream.StreamSupport; import static org.apache.flink.table.api.config.TableConfigOptions.TABLE_DML_SYNC; /** * Implementation of {@link TableEnvironment} that works exclusively with Table API interfaces. Only * {@link TableSource} is supported as an input and {@link TableSink} as an output. It also does not * bind to any particular {@code StreamExecutionEnvironment}. */ @Internal public class TableEnvironmentImpl implements TableEnvironmentInternal { // Flag that tells if the TableSource/TableSink used in this environment is stream table // source/sink, // and this should always be true. This avoids too many hard code. private static final boolean IS_STREAM_TABLE = true; private final CatalogManager catalogManager; private final ModuleManager moduleManager; private final OperationTreeBuilder operationTreeBuilder; private final List bufferedModifyOperations = new ArrayList<>(); protected final TableConfig tableConfig; protected final Executor execEnv; protected final FunctionCatalog functionCatalog; protected final Planner planner; private final boolean isStreamingMode; private final ClassLoader userClassLoader; private static final String UNSUPPORTED_QUERY_IN_SQL_UPDATE_MSG = "Unsupported SQL query! sqlUpdate() only accepts a single SQL statement of type " + "INSERT, CREATE TABLE, DROP TABLE, ALTER TABLE, USE CATALOG, USE [CATALOG.]DATABASE, " + "CREATE DATABASE, DROP DATABASE, ALTER DATABASE, CREATE FUNCTION, DROP FUNCTION, ALTER FUNCTION, " + "CREATE CATALOG, DROP CATALOG, CREATE VIEW, DROP VIEW, LOAD MODULE, UNLOAD " + "MODULE, USE MODULES."; private static final String UNSUPPORTED_QUERY_IN_EXECUTE_SQL_MSG = "Unsupported SQL query! 
executeSql() only accepts a single SQL statement of type " + "CREATE TABLE, DROP TABLE, ALTER TABLE, CREATE DATABASE, DROP DATABASE, ALTER DATABASE, " + "CREATE FUNCTION, DROP FUNCTION, ALTER FUNCTION, CREATE CATALOG, DROP CATALOG, " + "USE CATALOG, USE [CATALOG.]DATABASE, SHOW CATALOGS, SHOW DATABASES, SHOW TABLES, SHOW [USER] FUNCTIONS, SHOW PARTITIONS" + "CREATE VIEW, DROP VIEW, SHOW VIEWS, INSERT, DESCRIBE, LOAD MODULE, UNLOAD " + "MODULE, USE MODULES, SHOW [FULL] MODULES."; protected TableEnvironmentImpl( CatalogManager catalogManager, ModuleManager moduleManager, TableConfig tableConfig, Executor executor, FunctionCatalog functionCatalog, Planner planner, boolean isStreamingMode, ClassLoader userClassLoader) { this.catalogManager = catalogManager; this.moduleManager = moduleManager; this.execEnv = executor; this.tableConfig = tableConfig; this.functionCatalog = functionCatalog; this.planner = planner; this.isStreamingMode = isStreamingMode; this.userClassLoader = userClassLoader; this.operationTreeBuilder = OperationTreeBuilder.create( tableConfig, functionCatalog.asLookup(getParser()::parseIdentifier), catalogManager.getDataTypeFactory(), path -> { try { UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); Optional catalogQueryOperation = scanInternal(unresolvedIdentifier); return catalogQueryOperation.map( t -> ApiExpressionUtils.tableRef(path, t)); } catch (SqlParserException ex) { // The TableLookup is used during resolution of expressions and it // actually might not be an // identifier of a table. It might be a reference to some other // object such as column, local // reference etc. This method should return empty optional in such // cases to fallback for other // identifiers resolution. return Optional.empty(); } }, (sqlExpression, inputRowType, outputType) -> { try { return getParser() .parseSqlExpression( sqlExpression, inputRowType, outputType); } catch (Throwable t) { throw new ValidationException( String.format("Invalid SQL expression: %s", sqlExpression), t); } }, isStreamingMode); catalogManager.initSchemaResolver( isStreamingMode, operationTreeBuilder.getResolverBuilder()); } public static TableEnvironmentImpl create(Configuration configuration) { return create(EnvironmentSettings.fromConfiguration(configuration), configuration); } public static TableEnvironmentImpl create(EnvironmentSettings settings) { return create(settings, settings.toConfiguration()); } private static TableEnvironmentImpl create( EnvironmentSettings settings, Configuration configuration) { // temporary solution until FLINK-15635 is fixed final ClassLoader classLoader = Thread.currentThread().getContextClassLoader(); // use configuration to init table config final TableConfig tableConfig = new TableConfig(); tableConfig.addConfiguration(configuration); final ModuleManager moduleManager = new ModuleManager(); final CatalogManager catalogManager = CatalogManager.newBuilder() .classLoader(classLoader) .config(tableConfig.getConfiguration()) .defaultCatalog( settings.getBuiltInCatalogName(), new GenericInMemoryCatalog( settings.getBuiltInCatalogName(), settings.getBuiltInDatabaseName())) .build(); final FunctionCatalog functionCatalog = new FunctionCatalog(tableConfig, catalogManager, moduleManager); final ExecutorFactory executorFactory = FactoryUtil.discoverFactory( classLoader, ExecutorFactory.class, settings.getExecutor()); final Executor executor = executorFactory.create(configuration); final Planner planner = PlannerFactoryUtil.createPlanner( settings.getPlanner(), 
executor, tableConfig, catalogManager, functionCatalog); return new TableEnvironmentImpl( catalogManager, moduleManager, tableConfig, executor, functionCatalog, planner, settings.isStreamingMode(), classLoader); } @Override public Table fromValues(Object... values) { return fromValues(Arrays.asList(values)); } @Override public Table fromValues(AbstractDataType rowType, Object... values) { return fromValues(rowType, Arrays.asList(values)); } @Override public Table fromValues(Expression... values) { return createTable(operationTreeBuilder.values(values)); } @Override public Table fromValues(AbstractDataType rowType, Expression... values) { final DataType resolvedDataType = catalogManager.getDataTypeFactory().createDataType(rowType); return createTable(operationTreeBuilder.values(resolvedDataType, values)); } @Override public Table fromValues(Iterable values) { Expression[] exprs = StreamSupport.stream(values.spliterator(), false) .map(ApiExpressionUtils::objectToExpression) .toArray(Expression[]::new); return fromValues(exprs); } @Override public Table fromValues(AbstractDataType rowType, Iterable values) { Expression[] exprs = StreamSupport.stream(values.spliterator(), false) .map(ApiExpressionUtils::objectToExpression) .toArray(Expression[]::new); return fromValues(rowType, exprs); } @VisibleForTesting public Planner getPlanner() { return planner; } @Override public Table fromTableSource(TableSource source) { // only accept StreamTableSource and LookupableTableSource here // TODO should add a validation, while StreamTableSource is in flink-table-api-java-bridge // module now return createTable(new TableSourceQueryOperation<>(source, !IS_STREAM_TABLE)); } @Override public void registerCatalog(String catalogName, Catalog catalog) { catalogManager.registerCatalog(catalogName, catalog); } @Override public Optional getCatalog(String catalogName) { return catalogManager.getCatalog(catalogName); } @Override public void loadModule(String moduleName, Module module) { moduleManager.loadModule(moduleName, module); } @Override public void useModules(String... 
moduleNames) { moduleManager.useModules(moduleNames); } @Override public void unloadModule(String moduleName) { moduleManager.unloadModule(moduleName); } @Override public void registerFunction(String name, ScalarFunction function) { functionCatalog.registerTempSystemScalarFunction(name, function); } @Override public void createTemporarySystemFunction( String name, Class functionClass) { final UserDefinedFunction functionInstance = UserDefinedFunctionHelper.instantiateFunction(functionClass); createTemporarySystemFunction(name, functionInstance); } @Override public void createTemporarySystemFunction(String name, UserDefinedFunction functionInstance) { functionCatalog.registerTemporarySystemFunction(name, functionInstance, false); } @Override public boolean dropTemporarySystemFunction(String name) { return functionCatalog.dropTemporarySystemFunction(name, true); } @Override public void createFunction(String path, Class functionClass) { createFunction(path, functionClass, false); } @Override public void createFunction( String path, Class functionClass, boolean ignoreIfExists) { final UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); functionCatalog.registerCatalogFunction( unresolvedIdentifier, functionClass, ignoreIfExists); } @Override public boolean dropFunction(String path) { final UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); return functionCatalog.dropCatalogFunction(unresolvedIdentifier, true); } @Override public void createTemporaryFunction( String path, Class functionClass) { final UserDefinedFunction functionInstance = UserDefinedFunctionHelper.instantiateFunction(functionClass); createTemporaryFunction(path, functionInstance); } @Override public void createTemporaryFunction(String path, UserDefinedFunction functionInstance) { final UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); functionCatalog.registerTemporaryCatalogFunction( unresolvedIdentifier, functionInstance, false); } @Override public boolean dropTemporaryFunction(String path) { final UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); return functionCatalog.dropTemporaryCatalogFunction(unresolvedIdentifier, true); } @Override public void createTemporaryTable(String path, TableDescriptor descriptor) { Preconditions.checkNotNull(path, "Path must not be null."); Preconditions.checkNotNull(descriptor, "Table descriptor must not be null."); createTemporaryTableInternal(getParser().parseIdentifier(path), descriptor); } private void createTemporaryTableInternal( UnresolvedIdentifier path, TableDescriptor descriptor) { final ObjectIdentifier tableIdentifier = catalogManager.qualifyIdentifier(path); catalogManager.createTemporaryTable(descriptor.toCatalogTable(), tableIdentifier, false); } @Override public void createTable(String path, TableDescriptor descriptor) { Preconditions.checkNotNull(path, "Path must not be null."); Preconditions.checkNotNull(descriptor, "Table descriptor must not be null."); final ObjectIdentifier tableIdentifier = catalogManager.qualifyIdentifier(getParser().parseIdentifier(path)); catalogManager.createTable(descriptor.toCatalogTable(), tableIdentifier, false); } @Override public void registerTable(String name, Table table) { UnresolvedIdentifier identifier = UnresolvedIdentifier.of(name); createTemporaryView(identifier, table); } @Override public void createTemporaryView(String path, Table view) { Preconditions.checkNotNull(path, "Path must not be null."); 
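// note: the private overload invoked below rejects Table instances created by a different
// TableEnvironment and qualifies the parsed path against the current catalog and database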
Preconditions.checkNotNull(view, "Table view must not be null."); UnresolvedIdentifier identifier = getParser().parseIdentifier(path); createTemporaryView(identifier, view); } private void createTemporaryView(UnresolvedIdentifier identifier, Table view) { if (((TableImpl) view).getTableEnvironment() != this) { throw new TableException( "Only table API objects that belong to this TableEnvironment can be registered."); } ObjectIdentifier tableIdentifier = catalogManager.qualifyIdentifier(identifier); QueryOperation queryOperation = qualifyQueryOperation(tableIdentifier, view.getQueryOperation()); CatalogBaseTable tableTable = new QueryOperationCatalogView(queryOperation); catalogManager.createTemporaryTable(tableTable, tableIdentifier, false); } @Override public Table scan(String... tablePath) { UnresolvedIdentifier unresolvedIdentifier = UnresolvedIdentifier.of(tablePath); return scanInternal(unresolvedIdentifier) .map(this::createTable) .orElseThrow( () -> new ValidationException( String.format( "Table %s was not found.", unresolvedIdentifier))); } @Override public Table from(String path) { UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); return scanInternal(unresolvedIdentifier) .map(this::createTable) .orElseThrow( () -> new ValidationException( String.format( "Table %s was not found.", unresolvedIdentifier))); } @Override public Table from(TableDescriptor descriptor) { Preconditions.checkNotNull(descriptor, "Table descriptor must not be null."); final String path = TableDescriptorUtil.getUniqueAnonymousPath(); createTemporaryTableInternal(UnresolvedIdentifier.of(path), descriptor); return from(path); } @Override public void insertInto(String targetPath, Table table) { UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(targetPath); insertIntoInternal(unresolvedIdentifier, table); } @Override public void insertInto(Table table, String sinkPath, String... 
sinkPathContinued) { List fullPath = new ArrayList<>(Arrays.asList(sinkPathContinued)); fullPath.add(0, sinkPath); UnresolvedIdentifier unresolvedIdentifier = UnresolvedIdentifier.of(fullPath); insertIntoInternal(unresolvedIdentifier, table); } private void insertIntoInternal(UnresolvedIdentifier unresolvedIdentifier, Table table) { ObjectIdentifier objectIdentifier = catalogManager.qualifyIdentifier(unresolvedIdentifier); List modifyOperations = Collections.singletonList( new CatalogSinkModifyOperation( objectIdentifier, table.getQueryOperation())); buffer(modifyOperations); } private Optional scanInternal(UnresolvedIdentifier identifier) { ObjectIdentifier tableIdentifier = catalogManager.qualifyIdentifier(identifier); return catalogManager .getTable(tableIdentifier) .map(t -> new CatalogQueryOperation(tableIdentifier, t.getResolvedSchema())); } @Override public String[] listCatalogs() { return catalogManager.listCatalogs().stream().sorted().toArray(String[]::new); } @Override public String[] listModules() { return moduleManager.listModules().toArray(new String[0]); } @Override public ModuleEntry[] listFullModules() { return moduleManager.listFullModules().toArray(new ModuleEntry[0]); } @Override public String[] listDatabases() { return catalogManager .getCatalog(catalogManager.getCurrentCatalog()) .get() .listDatabases() .toArray(new String[0]); } @Override public String[] listTables() { return catalogManager.listTables().stream().sorted().toArray(String[]::new); } @Override public String[] listViews() { return catalogManager.listViews().stream().sorted().toArray(String[]::new); } @Override public String[] listTemporaryTables() { return catalogManager.listTemporaryTables().stream().sorted().toArray(String[]::new); } @Override public String[] listTemporaryViews() { return catalogManager.listTemporaryViews().stream().sorted().toArray(String[]::new); } @Override public boolean dropTemporaryTable(String path) { UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); ObjectIdentifier identifier = catalogManager.qualifyIdentifier(unresolvedIdentifier); try { catalogManager.dropTemporaryTable(identifier, false); return true; } catch (ValidationException e) { return false; } } @Override public boolean dropTemporaryView(String path) { UnresolvedIdentifier unresolvedIdentifier = getParser().parseIdentifier(path); ObjectIdentifier identifier = catalogManager.qualifyIdentifier(unresolvedIdentifier); try { catalogManager.dropTemporaryView(identifier, false); return true; } catch (ValidationException e) { return false; } } @Override public String[] listUserDefinedFunctions() { String[] functions = functionCatalog.getUserDefinedFunctions(); Arrays.sort(functions); return functions; } @Override public String[] listFunctions() { String[] functions = functionCatalog.getFunctions(); Arrays.sort(functions); return functions; } @Override public String explain(Table table) { return explain(table, false); } @Override public String explain(Table table, boolean extended) { return planner.explain( Collections.singletonList(table.getQueryOperation()), getExplainDetails(extended)); } @Override public String explain(boolean extended) { List operations = bufferedModifyOperations.stream() .map(o -> (Operation) o) .collect(Collectors.toList()); return planner.explain(operations, getExplainDetails(extended)); } @Override public String explainSql(String statement, ExplainDetail... 
extraDetails) { List operations = getParser().parse(statement); if (operations.size() != 1) { throw new TableException( "Unsupported SQL query! explainSql() only accepts a single SQL query."); } return explainInternal(operations, extraDetails); } @Override public String explainInternal(List operations, ExplainDetail... extraDetails) { operations = operations.stream() .filter(o -> !(o instanceof NopOperation)) .collect(Collectors.toList()); // hive parser may generate an NopOperation, in which case we just return an // empty string as the plan if (operations.isEmpty()) { return ""; } else { return planner.explain(operations, extraDetails); } } @Override public String[] getCompletionHints(String statement, int position) { return planner.getParser().getCompletionHints(statement, position); } @Override public Table sqlQuery(String query) { List operations = getParser().parse(query); if (operations.size() != 1) { throw new ValidationException( "Unsupported SQL query! sqlQuery() only accepts a single SQL query."); } Operation operation = operations.get(0); if (operation instanceof QueryOperation && !(operation instanceof ModifyOperation)) { return createTable((QueryOperation) operation); } else { throw new ValidationException( "Unsupported SQL query! sqlQuery() only accepts a single SQL query of type " + "SELECT, UNION, INTERSECT, EXCEPT, VALUES, and ORDER_BY."); } } // TODO: ------------ start:二次开发代码 --------------- // private static Method sqlParseMethod = null; private static AtomicBoolean canParse = new AtomicBoolean(true); // TODO: ------------ end:二次开发代码 ----------------- // @Override public TableResult executeSql(String statement) { // TODO: ------------ start:二次开发代码 --------------- // // 使用反射获取进行sql收集,避免api找不到的异常 try { if (canParse.get()) { if (sqlParseMethod == null) { Class clazz = Class.forName("com.zto.fire.flink.sql.FlinkSqlParser"); sqlParseMethod = clazz.getMethod("sqlParse", String.class); sqlParseMethod.setAccessible(true); } if (sqlParseMethod != null) sqlParseMethod.invoke(null, statement); } } catch (Exception e) { try { // 当调用sql解析相关api发生异常时,认为api无法被类加载器所加载,后续将不会尝试调用 canParse.set(false); } catch (Exception e1) {} } // TODO: ------------ end:二次开发代码 ----------------- // List operations = getParser().parse(statement); if (operations.size() != 1) { throw new TableException(UNSUPPORTED_QUERY_IN_EXECUTE_SQL_MSG); } return executeInternal(operations.get(0)); } @Override public StatementSet createStatementSet() { return new StatementSetImpl(this); } @Override public TableResult executeInternal(List operations) { List> transformations = translate(operations); List sinkIdentifierNames = extractSinkIdentifierNames(operations); TableResult result = executeInternal(transformations, sinkIdentifierNames); if (tableConfig.getConfiguration().get(TABLE_DML_SYNC)) { try { result.await(); } catch (InterruptedException | ExecutionException e) { result.getJobClient().ifPresent(JobClient::cancel); throw new TableException("Fail to wait execution finish.", e); } } return result; } private TableResult executeInternal( List> transformations, List sinkIdentifierNames) { final String defaultJobName = "insert-into_" + String.join(",", sinkIdentifierNames); Pipeline pipeline = execEnv.createPipeline( transformations, tableConfig.getConfiguration(), defaultJobName); try { JobClient jobClient = execEnv.executeAsync(pipeline); final List columns = new ArrayList<>(); Object[] affectedRowCounts = new Long[transformations.size()]; for (int i = 0; i < transformations.size(); ++i) { // use sink identifier name 
as field name columns.add(Column.physical(sinkIdentifierNames.get(i), DataTypes.BIGINT())); affectedRowCounts[i] = -1L; } return TableResultImpl.builder() .jobClient(jobClient) .resultKind(ResultKind.SUCCESS_WITH_CONTENT) .schema(ResolvedSchema.of(columns)) .data( new InsertResultIterator( jobClient, Row.of(affectedRowCounts), userClassLoader)) .build(); } catch (Exception e) { throw new TableException("Failed to execute sql", e); } } private TableResult executeQueryOperation(QueryOperation operation) { final UnresolvedIdentifier unresolvedIdentifier = UnresolvedIdentifier.of( "Unregistered_Collect_Sink_" + CollectModifyOperation.getUniqueId()); final ObjectIdentifier objectIdentifier = catalogManager.qualifyIdentifier(unresolvedIdentifier); CollectModifyOperation sinkOperation = new CollectModifyOperation(objectIdentifier, operation); List> transformations = translate(Collections.singletonList(sinkOperation)); final String defaultJobName = "collect"; Pipeline pipeline = execEnv.createPipeline( transformations, tableConfig.getConfiguration(), defaultJobName); try { JobClient jobClient = execEnv.executeAsync(pipeline); CollectResultProvider resultProvider = sinkOperation.getSelectResultProvider(); resultProvider.setJobClient(jobClient); return TableResultImpl.builder() .jobClient(jobClient) .resultKind(ResultKind.SUCCESS_WITH_CONTENT) .schema(operation.getResolvedSchema()) .data(resultProvider.getResultIterator()) .setPrintStyle( TableResultImpl.PrintStyle.tableau( PrintUtils.MAX_COLUMN_WIDTH, PrintUtils.NULL_COLUMN, true, isStreamingMode)) .setSessionTimeZone(getConfig().getLocalTimeZone()) .build(); } catch (Exception e) { throw new TableException("Failed to execute sql", e); } } @Override public void sqlUpdate(String stmt) { List operations = getParser().parse(stmt); if (operations.size() != 1) { throw new TableException(UNSUPPORTED_QUERY_IN_SQL_UPDATE_MSG); } Operation operation = operations.get(0); if (operation instanceof ModifyOperation) { buffer(Collections.singletonList((ModifyOperation) operation)); } else if (operation instanceof CreateTableOperation || operation instanceof DropTableOperation || operation instanceof AlterTableOperation || operation instanceof CreateViewOperation || operation instanceof DropViewOperation || operation instanceof CreateDatabaseOperation || operation instanceof DropDatabaseOperation || operation instanceof AlterDatabaseOperation || operation instanceof CreateCatalogFunctionOperation || operation instanceof CreateTempSystemFunctionOperation || operation instanceof DropCatalogFunctionOperation || operation instanceof DropTempSystemFunctionOperation || operation instanceof AlterCatalogFunctionOperation || operation instanceof CreateCatalogOperation || operation instanceof DropCatalogOperation || operation instanceof UseCatalogOperation || operation instanceof UseDatabaseOperation || operation instanceof LoadModuleOperation || operation instanceof UnloadModuleOperation || operation instanceof NopOperation) { executeInternal(operation); } else { throw new TableException(UNSUPPORTED_QUERY_IN_SQL_UPDATE_MSG); } } @Override public TableResult executeInternal(Operation operation) { if (operation instanceof ModifyOperation) { return executeInternal(Collections.singletonList((ModifyOperation) operation)); } else if (operation instanceof CreateTableOperation) { CreateTableOperation createTableOperation = (CreateTableOperation) operation; if (createTableOperation.isTemporary()) { catalogManager.createTemporaryTable( createTableOperation.getCatalogTable(), 
createTableOperation.getTableIdentifier(), createTableOperation.isIgnoreIfExists()); } else { catalogManager.createTable( createTableOperation.getCatalogTable(), createTableOperation.getTableIdentifier(), createTableOperation.isIgnoreIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof DropTableOperation) { DropTableOperation dropTableOperation = (DropTableOperation) operation; if (dropTableOperation.isTemporary()) { catalogManager.dropTemporaryTable( dropTableOperation.getTableIdentifier(), dropTableOperation.isIfExists()); } else { catalogManager.dropTable( dropTableOperation.getTableIdentifier(), dropTableOperation.isIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof AlterTableOperation) { AlterTableOperation alterTableOperation = (AlterTableOperation) operation; Catalog catalog = getCatalogOrThrowException( alterTableOperation.getTableIdentifier().getCatalogName()); String exMsg = getDDLOpExecuteErrorMsg(alterTableOperation.asSummaryString()); try { if (alterTableOperation instanceof AlterTableRenameOperation) { AlterTableRenameOperation alterTableRenameOp = (AlterTableRenameOperation) operation; catalog.renameTable( alterTableRenameOp.getTableIdentifier().toObjectPath(), alterTableRenameOp.getNewTableIdentifier().getObjectName(), false); } else if (alterTableOperation instanceof AlterTableOptionsOperation) { AlterTableOptionsOperation alterTablePropertiesOp = (AlterTableOptionsOperation) operation; catalogManager.alterTable( alterTablePropertiesOp.getCatalogTable(), alterTablePropertiesOp.getTableIdentifier(), false); } else if (alterTableOperation instanceof AlterTableAddConstraintOperation) { AlterTableAddConstraintOperation addConstraintOP = (AlterTableAddConstraintOperation) operation; CatalogTable oriTable = (CatalogTable) catalogManager .getTable(addConstraintOP.getTableIdentifier()) .get() .getTable(); TableSchema.Builder builder = TableSchemaUtils.builderWithGivenSchema(oriTable.getSchema()); if (addConstraintOP.getConstraintName().isPresent()) { builder.primaryKey( addConstraintOP.getConstraintName().get(), addConstraintOP.getColumnNames()); } else { builder.primaryKey(addConstraintOP.getColumnNames()); } CatalogTable newTable = new CatalogTableImpl( builder.build(), oriTable.getPartitionKeys(), oriTable.getOptions(), oriTable.getComment()); catalogManager.alterTable( newTable, addConstraintOP.getTableIdentifier(), false); } else if (alterTableOperation instanceof AlterTableDropConstraintOperation) { AlterTableDropConstraintOperation dropConstraintOperation = (AlterTableDropConstraintOperation) operation; CatalogTable oriTable = (CatalogTable) catalogManager .getTable(dropConstraintOperation.getTableIdentifier()) .get() .getTable(); CatalogTable newTable = new CatalogTableImpl( TableSchemaUtils.dropConstraint( oriTable.getSchema(), dropConstraintOperation.getConstraintName()), oriTable.getPartitionKeys(), oriTable.getOptions(), oriTable.getComment()); catalogManager.alterTable( newTable, dropConstraintOperation.getTableIdentifier(), false); } else if (alterTableOperation instanceof AlterPartitionPropertiesOperation) { AlterPartitionPropertiesOperation alterPartPropsOp = (AlterPartitionPropertiesOperation) operation; catalog.alterPartition( alterPartPropsOp.getTableIdentifier().toObjectPath(), alterPartPropsOp.getPartitionSpec(), alterPartPropsOp.getCatalogPartition(), false); } else if (alterTableOperation instanceof AlterTableSchemaOperation) { AlterTableSchemaOperation alterTableSchemaOperation = 
(AlterTableSchemaOperation) alterTableOperation; catalogManager.alterTable( alterTableSchemaOperation.getCatalogTable(), alterTableSchemaOperation.getTableIdentifier(), false); } else if (alterTableOperation instanceof AddPartitionsOperation) { AddPartitionsOperation addPartitionsOperation = (AddPartitionsOperation) alterTableOperation; List specs = addPartitionsOperation.getPartitionSpecs(); List partitions = addPartitionsOperation.getCatalogPartitions(); boolean ifNotExists = addPartitionsOperation.ifNotExists(); ObjectPath tablePath = addPartitionsOperation.getTableIdentifier().toObjectPath(); for (int i = 0; i < specs.size(); i++) { catalog.createPartition( tablePath, specs.get(i), partitions.get(i), ifNotExists); } } else if (alterTableOperation instanceof DropPartitionsOperation) { DropPartitionsOperation dropPartitionsOperation = (DropPartitionsOperation) alterTableOperation; ObjectPath tablePath = dropPartitionsOperation.getTableIdentifier().toObjectPath(); boolean ifExists = dropPartitionsOperation.ifExists(); for (CatalogPartitionSpec spec : dropPartitionsOperation.getPartitionSpecs()) { catalog.dropPartition(tablePath, spec, ifExists); } } return TableResultImpl.TABLE_RESULT_OK; } catch (TableAlreadyExistException | TableNotExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof CreateViewOperation) { CreateViewOperation createViewOperation = (CreateViewOperation) operation; if (createViewOperation.isTemporary()) { catalogManager.createTemporaryTable( createViewOperation.getCatalogView(), createViewOperation.getViewIdentifier(), createViewOperation.isIgnoreIfExists()); } else { catalogManager.createTable( createViewOperation.getCatalogView(), createViewOperation.getViewIdentifier(), createViewOperation.isIgnoreIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof DropViewOperation) { DropViewOperation dropViewOperation = (DropViewOperation) operation; if (dropViewOperation.isTemporary()) { catalogManager.dropTemporaryView( dropViewOperation.getViewIdentifier(), dropViewOperation.isIfExists()); } else { catalogManager.dropView( dropViewOperation.getViewIdentifier(), dropViewOperation.isIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof AlterViewOperation) { AlterViewOperation alterViewOperation = (AlterViewOperation) operation; Catalog catalog = getCatalogOrThrowException( alterViewOperation.getViewIdentifier().getCatalogName()); String exMsg = getDDLOpExecuteErrorMsg(alterViewOperation.asSummaryString()); try { if (alterViewOperation instanceof AlterViewRenameOperation) { AlterViewRenameOperation alterTableRenameOp = (AlterViewRenameOperation) operation; catalog.renameTable( alterTableRenameOp.getViewIdentifier().toObjectPath(), alterTableRenameOp.getNewViewIdentifier().getObjectName(), false); } else if (alterViewOperation instanceof AlterViewPropertiesOperation) { AlterViewPropertiesOperation alterTablePropertiesOp = (AlterViewPropertiesOperation) operation; catalogManager.alterTable( alterTablePropertiesOp.getCatalogView(), alterTablePropertiesOp.getViewIdentifier(), false); } else if (alterViewOperation instanceof AlterViewAsOperation) { AlterViewAsOperation alterViewAsOperation = (AlterViewAsOperation) alterViewOperation; catalogManager.alterTable( alterViewAsOperation.getNewView(), alterViewAsOperation.getViewIdentifier(), false); } return TableResultImpl.TABLE_RESULT_OK; } catch (TableAlreadyExistException 
| TableNotExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof CreateDatabaseOperation) { CreateDatabaseOperation createDatabaseOperation = (CreateDatabaseOperation) operation; Catalog catalog = getCatalogOrThrowException(createDatabaseOperation.getCatalogName()); String exMsg = getDDLOpExecuteErrorMsg(createDatabaseOperation.asSummaryString()); try { catalog.createDatabase( createDatabaseOperation.getDatabaseName(), createDatabaseOperation.getCatalogDatabase(), createDatabaseOperation.isIgnoreIfExists()); return TableResultImpl.TABLE_RESULT_OK; } catch (DatabaseAlreadyExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof DropDatabaseOperation) { DropDatabaseOperation dropDatabaseOperation = (DropDatabaseOperation) operation; Catalog catalog = getCatalogOrThrowException(dropDatabaseOperation.getCatalogName()); String exMsg = getDDLOpExecuteErrorMsg(dropDatabaseOperation.asSummaryString()); try { catalog.dropDatabase( dropDatabaseOperation.getDatabaseName(), dropDatabaseOperation.isIfExists(), dropDatabaseOperation.isCascade()); return TableResultImpl.TABLE_RESULT_OK; } catch (DatabaseNotExistException | DatabaseNotEmptyException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof AlterDatabaseOperation) { AlterDatabaseOperation alterDatabaseOperation = (AlterDatabaseOperation) operation; Catalog catalog = getCatalogOrThrowException(alterDatabaseOperation.getCatalogName()); String exMsg = getDDLOpExecuteErrorMsg(alterDatabaseOperation.asSummaryString()); try { catalog.alterDatabase( alterDatabaseOperation.getDatabaseName(), alterDatabaseOperation.getCatalogDatabase(), false); return TableResultImpl.TABLE_RESULT_OK; } catch (DatabaseNotExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof CreateCatalogFunctionOperation) { return createCatalogFunction((CreateCatalogFunctionOperation) operation); } else if (operation instanceof CreateTempSystemFunctionOperation) { return createSystemFunction((CreateTempSystemFunctionOperation) operation); } else if (operation instanceof DropCatalogFunctionOperation) { return dropCatalogFunction((DropCatalogFunctionOperation) operation); } else if (operation instanceof DropTempSystemFunctionOperation) { return dropSystemFunction((DropTempSystemFunctionOperation) operation); } else if (operation instanceof AlterCatalogFunctionOperation) { return alterCatalogFunction((AlterCatalogFunctionOperation) operation); } else if (operation instanceof CreateCatalogOperation) { return createCatalog((CreateCatalogOperation) operation); } else if (operation instanceof DropCatalogOperation) { DropCatalogOperation dropCatalogOperation = (DropCatalogOperation) operation; String exMsg = getDDLOpExecuteErrorMsg(dropCatalogOperation.asSummaryString()); try { catalogManager.unregisterCatalog( dropCatalogOperation.getCatalogName(), dropCatalogOperation.isIfExists()); return TableResultImpl.TABLE_RESULT_OK; } catch (CatalogException e) { throw new ValidationException(exMsg, e); } } else if (operation instanceof LoadModuleOperation) { return loadModule((LoadModuleOperation) operation); } else if (operation instanceof UnloadModuleOperation) { return unloadModule((UnloadModuleOperation) operation); } 
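// -----------------------------------------------------------------------------------------------------------
// Annotation added in this extract (not part of the original source): the DDL/DCL branches of this
// executeInternal(Operation) dispatch are normally reached through executeSql(). A minimal, hypothetical usage
// sketch against the public Table API (the table name and SQL text are illustrative only) might look like:
//
//   TableEnvironment tEnv = TableEnvironment.create(EnvironmentSettings.newInstance().inStreamingMode().build());
//   tEnv.executeSql("CREATE TABLE t_demo (id BIGINT, name STRING) WITH ('connector' = 'datagen')"); // CreateTableOperation branch
//   tEnv.executeSql("USE CATALOG default_catalog");                                                 // UseCatalogOperation branch
//   tEnv.executeSql("SHOW TABLES");                                                                 // ShowTablesOperation branch
//
// Each statement is parsed into exactly one Operation, matched against this instanceof chain, applied to the
// CatalogManager/ModuleManager, and answered with TableResultImpl.TABLE_RESULT_OK or a SHOW-style result set.
// -----------------------------------------------------------------------------------------------------------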
else if (operation instanceof UseModulesOperation) { return useModules((UseModulesOperation) operation); } else if (operation instanceof UseCatalogOperation) { UseCatalogOperation useCatalogOperation = (UseCatalogOperation) operation; catalogManager.setCurrentCatalog(useCatalogOperation.getCatalogName()); return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof UseDatabaseOperation) { UseDatabaseOperation useDatabaseOperation = (UseDatabaseOperation) operation; catalogManager.setCurrentCatalog(useDatabaseOperation.getCatalogName()); catalogManager.setCurrentDatabase(useDatabaseOperation.getDatabaseName()); return TableResultImpl.TABLE_RESULT_OK; } else if (operation instanceof ShowCatalogsOperation) { return buildShowResult("catalog name", listCatalogs()); } else if (operation instanceof ShowCreateTableOperation) { ShowCreateTableOperation showCreateTableOperation = (ShowCreateTableOperation) operation; Optional result = catalogManager.getTable(showCreateTableOperation.getTableIdentifier()); if (result.isPresent()) { return TableResultImpl.builder() .resultKind(ResultKind.SUCCESS_WITH_CONTENT) .schema(ResolvedSchema.of(Column.physical("result", DataTypes.STRING()))) .data( Collections.singletonList( Row.of( buildShowCreateTableRow( result.get().getResolvedTable(), showCreateTableOperation .getTableIdentifier(), result.get().isTemporary())))) .setPrintStyle(TableResultImpl.PrintStyle.rawContent()) .build(); } else { throw new ValidationException( String.format( "Could not execute SHOW CREATE TABLE. Table with identifier %s does not exist.", showCreateTableOperation .getTableIdentifier() .asSerializableString())); } } else if (operation instanceof ShowCurrentCatalogOperation) { return buildShowResult( "current catalog name", new String[] {catalogManager.getCurrentCatalog()}); } else if (operation instanceof ShowDatabasesOperation) { return buildShowResult("database name", listDatabases()); } else if (operation instanceof ShowCurrentDatabaseOperation) { return buildShowResult( "current database name", new String[] {catalogManager.getCurrentDatabase()}); } else if (operation instanceof ShowModulesOperation) { ShowModulesOperation showModulesOperation = (ShowModulesOperation) operation; if (showModulesOperation.requireFull()) { return buildShowFullModulesResult(listFullModules()); } else { return buildShowResult("module name", listModules()); } } else if (operation instanceof ShowTablesOperation) { return buildShowResult("table name", listTables()); } else if (operation instanceof ShowFunctionsOperation) { ShowFunctionsOperation showFunctionsOperation = (ShowFunctionsOperation) operation; String[] functionNames = null; ShowFunctionsOperation.FunctionScope functionScope = showFunctionsOperation.getFunctionScope(); switch (functionScope) { case USER: functionNames = listUserDefinedFunctions(); break; case ALL: functionNames = listFunctions(); break; default: throw new UnsupportedOperationException( String.format( "SHOW FUNCTIONS with %s scope is not supported.", functionScope)); } return buildShowResult("function name", functionNames); } else if (operation instanceof ShowViewsOperation) { return buildShowResult("view name", listViews()); } else if (operation instanceof ShowPartitionsOperation) { String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { ShowPartitionsOperation showPartitionsOperation = (ShowPartitionsOperation) operation; Catalog catalog = getCatalogOrThrowException( showPartitionsOperation.getTableIdentifier().getCatalogName()); ObjectPath 
tablePath = showPartitionsOperation.getTableIdentifier().toObjectPath(); CatalogPartitionSpec partitionSpec = showPartitionsOperation.getPartitionSpec(); List partitionSpecs = partitionSpec == null ? catalog.listPartitions(tablePath) : catalog.listPartitions(tablePath, partitionSpec); List partitionNames = new ArrayList<>(partitionSpecs.size()); for (CatalogPartitionSpec spec : partitionSpecs) { List partitionKVs = new ArrayList<>(spec.getPartitionSpec().size()); for (Map.Entry partitionKV : spec.getPartitionSpec().entrySet()) { partitionKVs.add(partitionKV.getKey() + "=" + partitionKV.getValue()); } partitionNames.add(String.join("/", partitionKVs)); } return buildShowResult("partition name", partitionNames.toArray(new String[0])); } catch (TableNotExistException e) { throw new ValidationException(exMsg, e); } catch (Exception e) { throw new TableException(exMsg, e); } } else if (operation instanceof ExplainOperation) { ExplainOperation explainOperation = (ExplainOperation) operation; ExplainDetail[] explainDetails = explainOperation.getExplainDetails().stream() .map(ExplainDetail::valueOf) .toArray(ExplainDetail[]::new); String explanation = explainInternal( Collections.singletonList(((ExplainOperation) operation).getChild()), explainDetails); return TableResultImpl.builder() .resultKind(ResultKind.SUCCESS_WITH_CONTENT) .schema(ResolvedSchema.of(Column.physical("result", DataTypes.STRING()))) .data(Collections.singletonList(Row.of(explanation))) .setPrintStyle(TableResultImpl.PrintStyle.rawContent()) .setSessionTimeZone(getConfig().getLocalTimeZone()) .build(); } else if (operation instanceof DescribeTableOperation) { DescribeTableOperation describeTableOperation = (DescribeTableOperation) operation; Optional result = catalogManager.getTable(describeTableOperation.getSqlIdentifier()); if (result.isPresent()) { return buildDescribeResult(result.get().getResolvedSchema()); } else { throw new ValidationException( String.format( "Tables or views with the identifier '%s' doesn't exist", describeTableOperation.getSqlIdentifier().asSummaryString())); } } else if (operation instanceof QueryOperation) { return executeQueryOperation((QueryOperation) operation); } else if (operation instanceof CreateTableASOperation) { executeInternal(((CreateTableASOperation) operation).getCreateTableOperation()); return executeInternal(((CreateTableASOperation) operation).getInsertOperation()); } else if (operation instanceof NopOperation) { return TableResultImpl.TABLE_RESULT_OK; } else { throw new TableException(UNSUPPORTED_QUERY_IN_EXECUTE_SQL_MSG); } } private TableResult createCatalog(CreateCatalogOperation operation) { String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { String catalogName = operation.getCatalogName(); Map properties = operation.getProperties(); Catalog catalog = FactoryUtil.createCatalog( catalogName, properties, tableConfig.getConfiguration(), userClassLoader); catalogManager.registerCatalog(catalogName, catalog); return TableResultImpl.TABLE_RESULT_OK; } catch (CatalogException e) { throw new ValidationException(exMsg, e); } } private TableResult loadModule(LoadModuleOperation operation) { final String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { final Module module = FactoryUtil.createModule( operation.getModuleName(), operation.getOptions(), tableConfig.getConfiguration(), userClassLoader); moduleManager.loadModule(operation.getModuleName(), module); return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw new 
ValidationException(String.format("%s. %s", exMsg, e.getMessage()), e); } catch (Exception e) { throw new TableException(String.format("%s. %s", exMsg, e.getMessage()), e); } } private TableResult unloadModule(UnloadModuleOperation operation) { String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { moduleManager.unloadModule(operation.getModuleName()); return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw new ValidationException(String.format("%s. %s", exMsg, e.getMessage()), e); } } private TableResult useModules(UseModulesOperation operation) { String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { moduleManager.useModules(operation.getModuleNames().toArray(new String[0])); return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw new ValidationException(String.format("%s. %s", exMsg, e.getMessage()), e); } } private TableResult buildShowResult(String columnName, String[] objects) { return buildResult( new String[] {columnName}, new DataType[] {DataTypes.STRING()}, Arrays.stream(objects).map((c) -> new String[] {c}).toArray(String[][]::new)); } private String buildShowCreateTableRow( ResolvedCatalogBaseTable table, ObjectIdentifier tableIdentifier, boolean isTemporary) { final String printIndent = " "; CatalogBaseTable.TableKind kind = table.getTableKind(); if (kind == CatalogBaseTable.TableKind.VIEW) { throw new TableException( String.format( "SHOW CREATE TABLE does not support showing CREATE VIEW statement with identifier %s.", tableIdentifier.asSerializableString())); } StringBuilder sb = new StringBuilder( String.format( "CREATE %sTABLE %s (\n", isTemporary ? "TEMPORARY " : "", tableIdentifier.asSerializableString())); ResolvedSchema schema = table.getResolvedSchema(); // append columns sb.append( schema.getColumns().stream() .map(column -> String.format("%s%s", printIndent, getColumnString(column))) .collect(Collectors.joining(",\n"))); // append watermark spec if (!schema.getWatermarkSpecs().isEmpty()) { sb.append(",\n"); sb.append( schema.getWatermarkSpecs().stream() .map( watermarkSpec -> String.format( "%sWATERMARK FOR %s AS %s", printIndent, EncodingUtils.escapeIdentifier( watermarkSpec.getRowtimeAttribute()), watermarkSpec .getWatermarkExpression() .asSerializableString())) .collect(Collectors.joining("\n"))); } // append constraint if (schema.getPrimaryKey().isPresent()) { sb.append(",\n"); sb.append(String.format("%s%s", printIndent, schema.getPrimaryKey().get())); } sb.append("\n) "); // append comment String comment = table.getComment(); if (StringUtils.isNotEmpty(comment)) { sb.append(String.format("COMMENT '%s'\n", comment)); } // append partitions ResolvedCatalogTable catalogTable = (ResolvedCatalogTable) table; if (catalogTable.isPartitioned()) { sb.append("PARTITIONED BY (") .append( catalogTable.getPartitionKeys().stream() .map(EncodingUtils::escapeIdentifier) .collect(Collectors.joining(", "))) .append(")\n"); } // append `with` properties Map options = table.getOptions(); sb.append("WITH (\n") .append( options.entrySet().stream() .map( entry -> String.format( "%s'%s' = '%s'", printIndent, entry.getKey(), entry.getValue())) .collect(Collectors.joining(",\n"))) .append("\n)\n"); return sb.toString(); } private String getColumnString(Column column) { final StringBuilder sb = new StringBuilder(); sb.append(EncodingUtils.escapeIdentifier(column.getName())); sb.append(" "); // skip data type for computed column if (column instanceof Column.ComputedColumn) { sb.append( 
column.explainExtras() .orElseThrow( () -> new TableException( String.format( "Column expression can not be null for computed column '%s'", column.getName())))); } else { sb.append(column.getDataType().getLogicalType().asSerializableString()); column.explainExtras() .ifPresent( e -> { sb.append(" "); sb.append(e); }); } // TODO: Print the column comment until FLINK-18958 is fixed return sb.toString(); } private TableResult buildShowFullModulesResult(ModuleEntry[] moduleEntries) { Object[][] rows = Arrays.stream(moduleEntries) .map(entry -> new Object[] {entry.name(), entry.used()}) .toArray(Object[][]::new); return buildResult( new String[] {"module name", "used"}, new DataType[] {DataTypes.STRING(), DataTypes.BOOLEAN()}, rows); } private TableResult buildDescribeResult(ResolvedSchema schema) { Map fieldToWatermark = schema.getWatermarkSpecs().stream() .collect( Collectors.toMap( WatermarkSpec::getRowtimeAttribute, spec -> spec.getWatermarkExpression().asSummaryString())); Map fieldToPrimaryKey = new HashMap<>(); schema.getPrimaryKey() .ifPresent( (p) -> { List columns = p.getColumns(); columns.forEach( (c) -> fieldToPrimaryKey.put( c, String.format( "PRI(%s)", String.join(", ", columns)))); }); Object[][] rows = schema.getColumns().stream() .map( (c) -> { final LogicalType logicalType = c.getDataType().getLogicalType(); return new Object[] { c.getName(), logicalType.copy(true).asSummaryString(), logicalType.isNullable(), fieldToPrimaryKey.getOrDefault(c.getName(), null), c.explainExtras().orElse(null), fieldToWatermark.getOrDefault(c.getName(), null) }; }) .toArray(Object[][]::new); return buildResult( new String[] {"name", "type", "null", "key", "extras", "watermark"}, new DataType[] { DataTypes.STRING(), DataTypes.STRING(), DataTypes.BOOLEAN(), DataTypes.STRING(), DataTypes.STRING(), DataTypes.STRING() }, rows); } private TableResult buildResult(String[] headers, DataType[] types, Object[][] rows) { return TableResultImpl.builder() .resultKind(ResultKind.SUCCESS_WITH_CONTENT) .schema(ResolvedSchema.physical(headers, types)) .data(Arrays.stream(rows).map(Row::of).collect(Collectors.toList())) .setPrintStyle( TableResultImpl.PrintStyle.tableau(Integer.MAX_VALUE, "", false, false)) .setSessionTimeZone(getConfig().getLocalTimeZone()) .build(); } /** * extract sink identifier names from {@link ModifyOperation}s. * *

If there are multiple ModifyOperations have same name, an index suffix will be added at * the end of the name to ensure each name is unique. */ private List extractSinkIdentifierNames(List operations) { List tableNames = new ArrayList<>(operations.size()); Map tableNameToCount = new HashMap<>(); for (ModifyOperation operation : operations) { if (operation instanceof CatalogSinkModifyOperation) { ObjectIdentifier identifier = ((CatalogSinkModifyOperation) operation).getTableIdentifier(); String fullName = identifier.asSummaryString(); tableNames.add(fullName); tableNameToCount.put(fullName, tableNameToCount.getOrDefault(fullName, 0) + 1); } else { throw new UnsupportedOperationException("Unsupported operation: " + operation); } } Map tableNameToIndex = new HashMap<>(); return tableNames.stream() .map( tableName -> { if (tableNameToCount.get(tableName) == 1) { return tableName; } else { Integer index = tableNameToIndex.getOrDefault(tableName, 0) + 1; tableNameToIndex.put(tableName, index); return tableName + "_" + index; } }) .collect(Collectors.toList()); } /** Get catalog from catalogName or throw a ValidationException if the catalog not exists. */ private Catalog getCatalogOrThrowException(String catalogName) { return getCatalog(catalogName) .orElseThrow( () -> new ValidationException( String.format("Catalog %s does not exist", catalogName))); } private String getDDLOpExecuteErrorMsg(String action) { return String.format("Could not execute %s", action); } @Override public String getCurrentCatalog() { return catalogManager.getCurrentCatalog(); } @Override public void useCatalog(String catalogName) { catalogManager.setCurrentCatalog(catalogName); } @Override public String getCurrentDatabase() { return catalogManager.getCurrentDatabase(); } @Override public void useDatabase(String databaseName) { catalogManager.setCurrentDatabase(databaseName); } @Override public TableConfig getConfig() { return tableConfig; } @Override public JobExecutionResult execute(String jobName) throws Exception { Pipeline pipeline = execEnv.createPipeline( translateAndClearBuffer(), tableConfig.getConfiguration(), jobName); return execEnv.execute(pipeline); } @Override public Parser getParser() { return getPlanner().getParser(); } @Override public CatalogManager getCatalogManager() { return catalogManager; } @Override public OperationTreeBuilder getOperationTreeBuilder() { return operationTreeBuilder; } /** * Subclasses can override this method to transform the given QueryOperation to a new one with * the qualified object identifier. This is needed for some QueryOperations, e.g. * JavaDataStreamQueryOperation, which doesn't know the registered identifier when created * ({@code fromDataStream(DataStream)}. But the identifier is required when converting this * QueryOperation to RelNode. */ protected QueryOperation qualifyQueryOperation( ObjectIdentifier identifier, QueryOperation queryOperation) { return queryOperation; } /** * Subclasses can override this method to add additional checks. * * @param tableSource tableSource to validate */ protected void validateTableSource(TableSource tableSource) { TableSourceValidation.validateTableSource(tableSource, tableSource.getTableSchema()); } /** * Translate the buffered operations to Transformations, and clear the buffer. * *

The buffer will be clear even if the `translate` fails. In most cases, the failure is not * retryable (e.g. type mismatch, can't generate physical plan). If the buffer is not clear * after failure, the following `translate` will also fail. */ protected List> translateAndClearBuffer() { List> transformations; try { transformations = translate(bufferedModifyOperations); } finally { bufferedModifyOperations.clear(); } return transformations; } protected List> translate(List modifyOperations) { return planner.translate(modifyOperations); } private void buffer(List modifyOperations) { bufferedModifyOperations.addAll(modifyOperations); } @VisibleForTesting protected ExplainDetail[] getExplainDetails(boolean extended) { if (extended) { if (isStreamingMode) { return new ExplainDetail[] { ExplainDetail.ESTIMATED_COST, ExplainDetail.CHANGELOG_MODE }; } else { return new ExplainDetail[] {ExplainDetail.ESTIMATED_COST}; } } else { return new ExplainDetail[0]; } } @Override public void registerTableSourceInternal(String name, TableSource tableSource) { validateTableSource(tableSource); ObjectIdentifier objectIdentifier = catalogManager.qualifyIdentifier(UnresolvedIdentifier.of(name)); Optional table = getTemporaryTable(objectIdentifier); if (table.isPresent()) { if (table.get() instanceof ConnectorCatalogTable) { ConnectorCatalogTable sourceSinkTable = (ConnectorCatalogTable) table.get(); if (sourceSinkTable.getTableSource().isPresent()) { throw new ValidationException( String.format( "Table '%s' already exists. Please choose a different name.", name)); } else { // wrapper contains only sink (not source) ConnectorCatalogTable sourceAndSink = ConnectorCatalogTable.sourceAndSink( tableSource, sourceSinkTable.getTableSink().get(), !IS_STREAM_TABLE); catalogManager.dropTemporaryTable(objectIdentifier, false); catalogManager.createTemporaryTable(sourceAndSink, objectIdentifier, false); } } else { throw new ValidationException( String.format( "Table '%s' already exists. Please choose a different name.", name)); } } else { ConnectorCatalogTable source = ConnectorCatalogTable.source(tableSource, !IS_STREAM_TABLE); catalogManager.createTemporaryTable(source, objectIdentifier, false); } } @Override public void registerTableSinkInternal(String name, TableSink tableSink) { ObjectIdentifier objectIdentifier = catalogManager.qualifyIdentifier(UnresolvedIdentifier.of(name)); Optional table = getTemporaryTable(objectIdentifier); if (table.isPresent()) { if (table.get() instanceof ConnectorCatalogTable) { ConnectorCatalogTable sourceSinkTable = (ConnectorCatalogTable) table.get(); if (sourceSinkTable.getTableSink().isPresent()) { throw new ValidationException( String.format( "Table '%s' already exists. Please choose a different name.", name)); } else { // wrapper contains only sink (not source) ConnectorCatalogTable sourceAndSink = ConnectorCatalogTable.sourceAndSink( sourceSinkTable.getTableSource().get(), tableSink, !IS_STREAM_TABLE); catalogManager.dropTemporaryTable(objectIdentifier, false); catalogManager.createTemporaryTable(sourceAndSink, objectIdentifier, false); } } else { throw new ValidationException( String.format( "Table '%s' already exists. 
Please choose a different name.", name)); } } else { ConnectorCatalogTable sink = ConnectorCatalogTable.sink(tableSink, !IS_STREAM_TABLE); catalogManager.createTemporaryTable(sink, objectIdentifier, false); } } private Optional getTemporaryTable(ObjectIdentifier identifier) { return catalogManager .getTable(identifier) .filter(CatalogManager.TableLookupResult::isTemporary) .map(CatalogManager.TableLookupResult::getTable); } private TableResult createCatalogFunction( CreateCatalogFunctionOperation createCatalogFunctionOperation) { String exMsg = getDDLOpExecuteErrorMsg(createCatalogFunctionOperation.asSummaryString()); try { if (createCatalogFunctionOperation.isTemporary()) { functionCatalog.registerTemporaryCatalogFunction( UnresolvedIdentifier.of( createCatalogFunctionOperation.getFunctionIdentifier().toList()), createCatalogFunctionOperation.getCatalogFunction(), createCatalogFunctionOperation.isIgnoreIfExists()); } else { Catalog catalog = getCatalogOrThrowException( createCatalogFunctionOperation .getFunctionIdentifier() .getCatalogName()); catalog.createFunction( createCatalogFunctionOperation.getFunctionIdentifier().toObjectPath(), createCatalogFunctionOperation.getCatalogFunction(), createCatalogFunctionOperation.isIgnoreIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (FunctionAlreadyExistException e) { throw new ValidationException(e.getMessage(), e); } catch (Exception e) { throw new TableException(exMsg, e); } } private TableResult alterCatalogFunction( AlterCatalogFunctionOperation alterCatalogFunctionOperation) { String exMsg = getDDLOpExecuteErrorMsg(alterCatalogFunctionOperation.asSummaryString()); try { CatalogFunction function = alterCatalogFunctionOperation.getCatalogFunction(); if (alterCatalogFunctionOperation.isTemporary()) { throw new ValidationException("Alter temporary catalog function is not supported"); } else { Catalog catalog = getCatalogOrThrowException( alterCatalogFunctionOperation .getFunctionIdentifier() .getCatalogName()); catalog.alterFunction( alterCatalogFunctionOperation.getFunctionIdentifier().toObjectPath(), function, alterCatalogFunctionOperation.isIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (FunctionNotExistException e) { throw new ValidationException(e.getMessage(), e); } catch (Exception e) { throw new TableException(exMsg, e); } } private TableResult dropCatalogFunction( DropCatalogFunctionOperation dropCatalogFunctionOperation) { String exMsg = getDDLOpExecuteErrorMsg(dropCatalogFunctionOperation.asSummaryString()); try { if (dropCatalogFunctionOperation.isTemporary()) { functionCatalog.dropTempCatalogFunction( dropCatalogFunctionOperation.getFunctionIdentifier(), dropCatalogFunctionOperation.isIfExists()); } else { Catalog catalog = getCatalogOrThrowException( dropCatalogFunctionOperation .getFunctionIdentifier() .getCatalogName()); catalog.dropFunction( dropCatalogFunctionOperation.getFunctionIdentifier().toObjectPath(), dropCatalogFunctionOperation.isIfExists()); } return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (FunctionNotExistException e) { throw new ValidationException(e.getMessage(), e); } catch (Exception e) { throw new TableException(exMsg, e); } } private TableResult createSystemFunction(CreateTempSystemFunctionOperation operation) { String exMsg = getDDLOpExecuteErrorMsg(operation.asSummaryString()); try { functionCatalog.registerTemporarySystemFunction( 
operation.getFunctionName(), operation.getCatalogFunction(), operation.isIgnoreIfExists()); return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (Exception e) { throw new TableException(exMsg, e); } } private TableResult dropSystemFunction(DropTempSystemFunctionOperation operation) { try { functionCatalog.dropTemporarySystemFunction( operation.getFunctionName(), operation.isIfExists()); return TableResultImpl.TABLE_RESULT_OK; } catch (ValidationException e) { throw e; } catch (Exception e) { throw new TableException(getDDLOpExecuteErrorMsg(operation.asSummaryString()), e); } } protected TableImpl createTable(QueryOperation tableOperation) { return TableImpl.createTable( this, tableOperation, operationTreeBuilder, functionCatalog.asLookup(getParser()::parseIdentifier)); } @Override public String getJsonPlan(String stmt) { List operations = getParser().parse(stmt); if (operations.size() != 1) { throw new TableException( "Unsupported SQL query! getJsonPlan() only accepts a single INSERT statement."); } Operation operation = operations.get(0); List modifyOperations = new ArrayList<>(1); if (operation instanceof ModifyOperation) { modifyOperations.add((ModifyOperation) operation); } else { throw new TableException("Only INSERT is supported now."); } return getJsonPlan(modifyOperations); } @Override public String getJsonPlan(List operations) { return planner.getJsonPlan(operations); } @Override public String explainJsonPlan(String jsonPlan, ExplainDetail... extraDetails) { return planner.explainJsonPlan(jsonPlan, extraDetails); } @Override public TableResult executeJsonPlan(String jsonPlan) { List> transformations = planner.translateJsonPlan(jsonPlan); List sinkIdentifierNames = new ArrayList<>(); for (int i = 0; i < transformations.size(); ++i) { // TODO serialize the sink table names to json plan ? sinkIdentifierNames.add("sink" + i); } return executeInternal(transformations, sinkIdentifierNames); } } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/apache/flink/util/ExceptionUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // // The function "stringifyException" is based on source code from the Hadoop Project // (http://hadoop.apache.org/), // licensed by the Apache Software Foundation (ASF) under the Apache License, Version 2.0. // See the NOTICE file distributed with this work for additional information regarding copyright // ownership. 
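//
// Annotation added in this extract (not part of the upstream header): this copy of ExceptionUtils lives under
// fire-enhance/apache-flink and is compiled into the same org.apache.flink.util package as Flink's original
// class, presumably so it takes precedence on the classpath. The blocks marked "二次开发代码" (secondary
// development, i.e. fire custom code) below forward stringified and inspected exceptions to
// com.zto.fire.common.util.ExceptionBus for centralized collection.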
// package org.apache.flink.util; import com.zto.fire.common.util.ExceptionBus; import org.apache.flink.annotation.Internal; import org.apache.flink.util.function.RunnableWithException; import javax.annotation.Nullable; import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; import java.lang.reflect.Field; import java.util.Optional; import java.util.concurrent.CompletionException; import java.util.concurrent.ExecutionException; import java.util.function.Function; import java.util.function.Predicate; import static org.apache.flink.util.Preconditions.checkNotNull; /** A collection of utility functions for dealing with exceptions and exception workflows. */ @Internal public final class ExceptionUtils { /** The stringified representation of a null exception reference. */ public static final String STRINGIFIED_NULL_EXCEPTION = "(null)"; // TODO: ------------ start:二次开发代码 --------------- // /** * Makes a string representation of the exception's stack trace, or "(null)", if the exception * is null. * *

This method makes a best effort and never fails. * * @param e The exception to stringify. * @return A string with exception name and call stack. */ public static String stringifyException(final Throwable e) { return stringifyException(e, ""); } /** * Makes a string representation of the exception's stack trace, or "(null)", if the exception * is null. * *

This method makes a best effort and never fails. * * @param e The exception to stringify. * @return A string with exception name and call stack. */ public static String stringifyException(final Throwable e, String sql) { if (e == null) { return STRINGIFIED_NULL_EXCEPTION; } try { StringWriter stm = new StringWriter(); PrintWriter wrt = new PrintWriter(stm); e.printStackTrace(wrt); wrt.close(); ExceptionBus.post(e, sql); return stm.toString(); } catch (Throwable t) { return e.getClass().getName() + " (error while printing stack trace)"; } } // TODO: ------------ end:二次开发代码 --------------- // /** * Checks whether the given exception indicates a situation that may leave the JVM in a * corrupted state, meaning a state where continued normal operation can only be guaranteed via * clean process restart. * *

Currently considered fatal exceptions are Virtual Machine errors indicating that the JVM * is corrupted, like {@link InternalError}, {@link UnknownError}, and {@link * java.util.zip.ZipError} (a special case of InternalError). The {@link ThreadDeath} exception * is also treated as a fatal error, because when a thread is forcefully stopped, there is a * high chance that parts of the system are in an inconsistent state. * * @param t The exception to check. * @return True, if the exception is considered fatal to the JVM, false otherwise. */ public static boolean isJvmFatalError(Throwable t) { return (t instanceof InternalError) || (t instanceof UnknownError) || (t instanceof ThreadDeath); } /** * Checks whether the given exception indicates a situation that may leave the JVM in a * corrupted state, or an out-of-memory error. * *

See {@link ExceptionUtils#isJvmFatalError(Throwable)} for a list of fatal JVM errors. This * method additionally classifies the {@link OutOfMemoryError} as fatal, because it may occur in * any thread (not the one that allocated the majority of the memory) and thus is often not * recoverable by destroying the particular thread that threw the exception. * * @param t The exception to check. * @return True, if the exception is fatal to the JVM or and OutOfMemoryError, false otherwise. */ public static boolean isJvmFatalOrOutOfMemoryError(Throwable t) { return isJvmFatalError(t) || t instanceof OutOfMemoryError; } /** * Tries to enrich OutOfMemoryErrors being part of the passed root Throwable's cause tree. * *

This method improves error messages for direct and metaspace {@link OutOfMemoryError}. It * adds description about the possible causes and ways of resolution. * * @param root The Throwable of which the cause tree shall be traversed. * @param jvmMetaspaceOomNewErrorMessage The message being used for JVM metaspace-related * OutOfMemoryErrors. Passing null will disable handling this class of error. * @param jvmDirectOomNewErrorMessage The message being used for direct memory-related * OutOfMemoryErrors. Passing null will disable handling this class of error. * @param jvmHeapSpaceOomNewErrorMessage The message being used for Heap space-related * OutOfMemoryErrors. Passing null will disable handling this class of error. */ public static void tryEnrichOutOfMemoryError( @Nullable Throwable root, @Nullable String jvmMetaspaceOomNewErrorMessage, @Nullable String jvmDirectOomNewErrorMessage, @Nullable String jvmHeapSpaceOomNewErrorMessage) { updateDetailMessage( root, t -> { if (isMetaspaceOutOfMemoryError(t)) { return jvmMetaspaceOomNewErrorMessage; } else if (isDirectOutOfMemoryError(t)) { return jvmDirectOomNewErrorMessage; } else if (isHeapSpaceOutOfMemoryError(t)) { return jvmHeapSpaceOomNewErrorMessage; } return null; }); } /** * Updates error messages of Throwables appearing in the cause tree of the passed root * Throwable. The passed Function is applied on each Throwable of the cause tree. Returning a * String will cause the detailMessage of the corresponding Throwable to be updated. Returning * null, instead, won't trigger any detailMessage update on that Throwable. * * @param root The Throwable whose cause tree shall be traversed. * @param throwableToMessage The Function based on which the new messages are generated. The * function implementation should return the new message. Returning null, in * contrast, will result in not updating the message for the corresponding Throwable. */ public static void updateDetailMessage( @Nullable Throwable root, @Nullable Function throwableToMessage) { if (throwableToMessage == null) { return; } Throwable it = root; while (it != null) { String newMessage = throwableToMessage.apply(it); if (newMessage != null) { updateDetailMessageOfThrowable(it, newMessage); } it = it.getCause(); } } private static void updateDetailMessageOfThrowable( Throwable throwable, String newDetailMessage) { Field field; try { field = Throwable.class.getDeclaredField("detailMessage"); } catch (NoSuchFieldException e) { throw new IllegalStateException( "The JDK Throwable contains a detailMessage member. The Throwable class provided on the classpath does not which is why this exception appears.", e); } field.setAccessible(true); try { field.set(throwable, newDetailMessage); } catch (IllegalAccessException e) { throw new IllegalStateException( "The JDK Throwable contains a private detailMessage member that should be accessible through reflection. This is not the case for the Throwable class provided on the classpath.", e); } } /** * Checks whether the given exception indicates a JVM metaspace out-of-memory error. * * @param t The exception to check. * @return True, if the exception is the metaspace {@link OutOfMemoryError}, false otherwise. */ public static boolean isMetaspaceOutOfMemoryError(@Nullable Throwable t) { return isOutOfMemoryErrorWithMessageStartingWith(t, "Metaspace"); } /** * Checks whether the given exception indicates a JVM direct out-of-memory error. * * @param t The exception to check. 
* @return True, if the exception is the direct {@link OutOfMemoryError}, false otherwise. */ public static boolean isDirectOutOfMemoryError(@Nullable Throwable t) { return isOutOfMemoryErrorWithMessageStartingWith(t, "Direct buffer memory"); } public static boolean isHeapSpaceOutOfMemoryError(@Nullable Throwable t) { return isOutOfMemoryErrorWithMessageStartingWith(t, "Java heap space"); } private static boolean isOutOfMemoryErrorWithMessageStartingWith( @Nullable Throwable t, String prefix) { // the exact matching of the class is checked to avoid matching any custom subclasses of // OutOfMemoryError // as we are interested in the original exceptions, generated by JVM. return isOutOfMemoryError(t) && t.getMessage() != null && t.getMessage().startsWith(prefix); } private static boolean isOutOfMemoryError(@Nullable Throwable t) { return t != null && t.getClass() == OutOfMemoryError.class; } /** * Rethrows the given {@code Throwable}, if it represents an error that is fatal to the JVM. See * {@link ExceptionUtils#isJvmFatalError(Throwable)} for a definition of fatal errors. * * @param t The Throwable to check and rethrow. */ public static void rethrowIfFatalError(Throwable t) { if (isJvmFatalError(t)) { throw (Error) t; } } /** * Rethrows the given {@code Throwable}, if it represents an error that is fatal to the JVM or * an out-of-memory error. See {@link ExceptionUtils#isJvmFatalError(Throwable)} for a * definition of fatal errors. * * @param t The Throwable to check and rethrow. */ public static void rethrowIfFatalErrorOrOOM(Throwable t) { if (isJvmFatalError(t) || t instanceof OutOfMemoryError) { throw (Error) t; } } /** * Adds a new exception as a {@link Throwable#addSuppressed(Throwable) suppressed exception} to * a prior exception, or returns the new exception, if no prior exception exists. * *

<pre>{@code
     * public void closeAllThings() throws Exception {
     *     Exception ex = null;
     *     try {
     *         component.shutdown();
     *     } catch (Exception e) {
     *         ex = firstOrSuppressed(e, ex);
     *     }
     *     try {
     *         anotherComponent.stop();
     *     } catch (Exception e) {
     *         ex = firstOrSuppressed(e, ex);
     *     }
     *     try {
     *         lastComponent.shutdown();
     *     } catch (Exception e) {
     *         ex = firstOrSuppressed(e, ex);
     *     }
     *
     *     if (ex != null) {
     *         throw ex;
     *     }
     * }
     * }</pre>
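     *
     * <p>(Note added in this extract, not part of the upstream javadoc: with this pattern the first failure is
     * the exception that is ultimately thrown, while every later failure remains reachable through
     * {@link Throwable#getSuppressed()}.)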
* * @param newException The newly occurred exception * @param previous The previously occurred exception, possibly null. * @return The new exception, if no previous exception exists, or the previous exception with * the new exception in the list of suppressed exceptions. */ public static T firstOrSuppressed(T newException, @Nullable T previous) { checkNotNull(newException, "newException"); if (previous == null) { return newException; } else { previous.addSuppressed(newException); return previous; } } /** * Throws the given {@code Throwable} in scenarios where the signatures do not allow you to * throw an arbitrary Throwable. Errors and RuntimeExceptions are thrown directly, other * exceptions are packed into runtime exceptions * * @param t The throwable to be thrown. */ public static void rethrow(Throwable t) { if (t instanceof Error) { throw (Error) t; } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else { throw new RuntimeException(t); } } /** * Throws the given {@code Throwable} in scenarios where the signatures do not allow you to * throw an arbitrary Throwable. Errors and RuntimeExceptions are thrown directly, other * exceptions are packed into a parent RuntimeException. * * @param t The throwable to be thrown. * @param parentMessage The message for the parent RuntimeException, if one is needed. */ public static void rethrow(Throwable t, String parentMessage) { if (t instanceof Error) { throw (Error) t; } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else { throw new RuntimeException(parentMessage, t); } } /** * Throws the given {@code Throwable} in scenarios where the signatures do allow to throw a * Exception. Errors and Exceptions are thrown directly, other "exotic" subclasses of Throwable * are wrapped in an Exception. * * @param t The throwable to be thrown. * @param parentMessage The message for the parent Exception, if one is needed. */ public static void rethrowException(Throwable t, String parentMessage) throws Exception { if (t instanceof Error) { throw (Error) t; } else if (t instanceof Exception) { throw (Exception) t; } else { throw new Exception(parentMessage, t); } } /** * Throws the given {@code Throwable} in scenarios where the signatures do allow to throw a * Exception. Errors and Exceptions are thrown directly, other "exotic" subclasses of Throwable * are wrapped in an Exception. * * @param t The throwable to be thrown. */ public static void rethrowException(Throwable t) throws Exception { if (t instanceof Error) { throw (Error) t; } else if (t instanceof Exception) { throw (Exception) t; } else { throw new Exception(t.getMessage(), t); } } /** * Tries to throw the given exception if not null. * * @param e exception to throw if not null. * @throws Exception */ public static void tryRethrowException(@Nullable Exception e) throws Exception { if (e != null) { throw e; } } /** * Tries to throw the given {@code Throwable} in scenarios where the signatures allows only * IOExceptions (and RuntimeException and Error). Throws this exception directly, if it is an * IOException, a RuntimeException, or an Error. Otherwise does nothing. * * @param t The Throwable to be thrown. 
*/ public static void tryRethrowIOException(Throwable t) throws IOException { if (t instanceof IOException) { throw (IOException) t; } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else if (t instanceof Error) { throw (Error) t; } } /** * Re-throws the given {@code Throwable} in scenarios where the signatures allows only * IOExceptions (and RuntimeException and Error). * *

Throws this exception directly, if it is an IOException, a RuntimeException, or an Error. * Otherwise it wraps it in an IOException and throws it. * * @param t The Throwable to be thrown. */ public static void rethrowIOException(Throwable t) throws IOException { if (t instanceof IOException) { throw (IOException) t; } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else if (t instanceof Error) { throw (Error) t; } else { throw new IOException(t.getMessage(), t); } } /** * Checks whether a throwable chain contains a specific type of exception and returns it. It * deserializes any {@link SerializedThrowable} that are found using the provided {@link * ClassLoader}. * * @param throwable the throwable chain to check. * @param searchType the type of exception to search for in the chain. * @param classLoader to use for deserialization. * @return Optional throwable of the requested type if available, otherwise empty */ public static Optional findSerializedThrowable( Throwable throwable, Class searchType, ClassLoader classLoader) { if (throwable == null || searchType == null) { return Optional.empty(); } Throwable t = throwable; while (t != null) { if (searchType.isAssignableFrom(t.getClass())) { return Optional.of(searchType.cast(t)); } else if (t.getClass().isAssignableFrom(SerializedThrowable.class)) { Throwable next = ((SerializedThrowable) t).deserializeError(classLoader); // SerializedThrowable#deserializeError returns itself under some conditions (e.g., // null cause). // If that happens, exit to avoid looping infinitely. This is ok because if the user // was searching // for a SerializedThrowable, we would have returned it in the initial if condition. t = (next == t) ? null : next; } else { t = t.getCause(); } } return Optional.empty(); } /** * Checks whether a throwable chain contains a specific type of exception and returns it. * * @param throwable the throwable chain to check. * @param searchType the type of exception to search for in the chain. * @return Optional throwable of the requested type if available, otherwise empty */ public static Optional findThrowable( Throwable throwable, Class searchType) { if (throwable == null || searchType == null) { return Optional.empty(); } // TODO: ------------ start:二次开发代码 --------------- // ExceptionBus.post(throwable, ""); // TODO: ------------ end:二次开发代码 --------------- // Throwable t = throwable; while (t != null) { if (searchType.isAssignableFrom(t.getClass())) { return Optional.of(searchType.cast(t)); } else { t = t.getCause(); } } return Optional.empty(); } /** * Checks whether a throwable chain contains a specific type of exception and returns it. This * method handles {@link SerializedThrowable}s in the chain and deserializes them with the given * ClassLoader. * *

SerializedThrowables are often used when exceptions might come from dynamically loaded * code and be transported over RPC / HTTP for better error reporting. The receiving processes * or threads might not have the dynamically loaded code available. * * @param throwable the throwable chain to check. * @param searchType the type of exception to search for in the chain. * @param classLoader the ClassLoader to use when encountering a SerializedThrowable. * @return Optional throwable of the requested type if available, otherwise empty */ public static Optional findThrowableSerializedAware( Throwable throwable, Class searchType, ClassLoader classLoader) { if (throwable == null || searchType == null) { return Optional.empty(); } Throwable t = throwable; while (t != null) { if (searchType.isAssignableFrom(t.getClass())) { return Optional.of(searchType.cast(t)); } else if (t instanceof SerializedThrowable) { t = ((SerializedThrowable) t).deserializeError(classLoader); } else { t = t.getCause(); } } return Optional.empty(); } /** * Checks whether a throwable chain contains an exception matching a predicate and returns it. * * @param throwable the throwable chain to check. * @param predicate the predicate of the exception to search for in the chain. * @return Optional throwable of the requested type if available, otherwise empty */ public static Optional findThrowable( Throwable throwable, Predicate predicate) { if (throwable == null || predicate == null) { return Optional.empty(); } Throwable t = throwable; while (t != null) { if (predicate.test(t)) { return Optional.of(t); } else { t = t.getCause(); } } return Optional.empty(); } /** * Checks whether a throwable chain contains a specific error message and returns the * corresponding throwable. * * @param throwable the throwable chain to check. * @param searchMessage the error message to search for in the chain. * @return Optional throwable containing the search message if available, otherwise empty */ public static Optional findThrowableWithMessage( Throwable throwable, String searchMessage) { if (throwable == null || searchMessage == null) { return Optional.empty(); } Throwable t = throwable; while (t != null) { if (t.getMessage() != null && t.getMessage().contains(searchMessage)) { return Optional.of(t); } else { t = t.getCause(); } } return Optional.empty(); } /** * Unpacks an {@link ExecutionException} and returns its cause. Otherwise the given Throwable is * returned. * * @param throwable to unpack if it is an ExecutionException * @return Cause of ExecutionException or given Throwable */ public static Throwable stripExecutionException(Throwable throwable) { return stripException(throwable, ExecutionException.class); } /** * Unpacks an {@link CompletionException} and returns its cause. Otherwise the given Throwable * is returned. * * @param throwable to unpack if it is an CompletionException * @return Cause of CompletionException or given Throwable */ public static Throwable stripCompletionException(Throwable throwable) { return stripException(throwable, CompletionException.class); } /** * Unpacks an specified exception and returns its cause. Otherwise the given {@link Throwable} * is returned. 
* * @param throwableToStrip to strip * @param typeToStrip type to strip * @return Unpacked cause or given Throwable if not packed */ public static Throwable stripException( Throwable throwableToStrip, Class typeToStrip) { while (typeToStrip.isAssignableFrom(throwableToStrip.getClass()) && throwableToStrip.getCause() != null) { throwableToStrip = throwableToStrip.getCause(); } return throwableToStrip; } /** * Tries to find a {@link SerializedThrowable} as the cause of the given throwable and throws * its deserialized value. If there is no such throwable, then the original throwable is thrown. * * @param throwable to check for a SerializedThrowable * @param classLoader to be used for the deserialization of the SerializedThrowable * @throws Throwable either the deserialized throwable or the given throwable */ public static void tryDeserializeAndThrow(Throwable throwable, ClassLoader classLoader) throws Throwable { Throwable current = throwable; while (!(current instanceof SerializedThrowable) && current.getCause() != null) { current = current.getCause(); } if (current instanceof SerializedThrowable) { throw ((SerializedThrowable) current).deserializeError(classLoader); } else { throw throwable; } } /** * Checks whether the given exception is a {@link InterruptedException} and sets the interrupted * flag accordingly. * * @param e to check whether it is an {@link InterruptedException} */ public static void checkInterrupted(Throwable e) { if (e instanceof InterruptedException) { Thread.currentThread().interrupt(); } } // ------------------------------------------------------------------------ // Lambda exception utilities // ------------------------------------------------------------------------ public static void suppressExceptions(RunnableWithException action) { try { action.run(); } catch (InterruptedException e) { // restore interrupted state Thread.currentThread().interrupt(); } catch (Throwable t) { if (isJvmFatalError(t)) { rethrow(t); } } } // ------------------------------------------------------------------------ /** Private constructor to prevent instantiation. */ private ExceptionUtils() {} } ================================================ FILE: fire-enhance/apache-flink/src/main/java-flink-1.14/org/rocksdb/RocksDB.java ================================================ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). package org.rocksdb; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import com.zto.fire.common.util.PropUtils; import org.rocksdb.util.Environment; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A RocksDB is a persistent ordered map from keys to values. It is safe for * concurrent access from multiple threads without any external synchronization. * All methods of this class could potentially throw RocksDBException, which * indicates sth wrong at the RocksDB library side and the call failed. 
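* <p>Note on the fire-enhance additions below: when a state read is slower than a configured
* threshold, the elapsed time is logged. A sketch of the relevant settings (the values shown
* are the defaults used by this class):
* <pre>{@code
* # log a state get taking 50 ms or more; values < 1 disable the logging
* flink.state.log.threshold=50
* # stop logging after 300000 slow-get entries; values < 1 mean no limit
* flink.state.log.threshold.max_count=300000
* }</pre>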
*/
public class RocksDB extends RocksObject {
  public static final byte[] DEFAULT_COLUMN_FAMILY = "default".getBytes();
  public static final int NOT_FOUND = -1;

  // TODO: ------------ start: fire secondary-development code --------------- //
  // Log a state get that takes longer than this threshold (ms); values less than 1 disable the logging
  protected long logThreshold = PropUtils.getLong("flink.state.log.threshold", 50, 1);
  // Maximum number of slow-state-get log entries to emit; values less than 1 mean no limit
  protected long logThresholdMaxCount = PropUtils.getLong("flink.state.log.threshold.max_count", 300000, 1);
  protected AtomicLong currentLogCount = new AtomicLong();
  protected static Logger logger = LoggerFactory.getLogger(RocksDB.class);

  /**
   * Measures how long a state get took and logs it when it exceeds the threshold.
   */
  protected void elapsed(long start) {
    long elapsed = System.currentTimeMillis() - start;
    // The elapsed time is logged only when both of the following conditions hold:
    // 1. the threshold configured via flink.state.log.threshold is greater than 0
    // 2. flink.state.log.threshold.max_count is less than 1, or the number of logged entries is still below it
    if (this.logThreshold > 0 && (this.logThresholdMaxCount < 1 || this.currentLogCount.get() <= this.logThresholdMaxCount)) {
      if (elapsed >= this.logThreshold * 2) {
        logger.warn("RocksDB state get elapsed:{}ms.", elapsed);
        this.currentLogCount.incrementAndGet();
      } else if (elapsed >= this.logThreshold) {
        logger.info("RocksDB state get elapsed:{}ms.", elapsed);
        this.currentLogCount.incrementAndGet();
      }
    }
  }
  // TODO: ------------ end: fire secondary-development code --------------- //

  private enum LibraryState {
    NOT_LOADED,
    LOADING,
    LOADED
  }

  private static AtomicReference<LibraryState> libraryLoaded =
      new AtomicReference<>(LibraryState.NOT_LOADED);

  static {
    RocksDB.loadLibrary();
  }

  private List<ColumnFamilyHandle> ownedColumnFamilyHandles = new ArrayList<>();

  /**
   * Loads the necessary library files.
   * Calling this method twice will have no effect.
   * By default the method extracts the shared library for loading at
   * java.io.tmpdir, however, you can override this temporary location by
   * setting the environment variable ROCKSDB_SHAREDLIB_DIR.
   */
  public static void loadLibrary() {
    if (libraryLoaded.get() == LibraryState.LOADED) {
      return;
    }

    if (libraryLoaded.compareAndSet(LibraryState.NOT_LOADED, LibraryState.LOADING)) {
      final String tmpDir = System.getenv("ROCKSDB_SHAREDLIB_DIR");
      // loading possibly necessary libraries.
      for (final CompressionType compressionType : CompressionType.values()) {
        try {
          if (compressionType.getLibraryName() != null) {
            System.loadLibrary(compressionType.getLibraryName());
          }
        } catch (final UnsatisfiedLinkError e) {
          // since it may be optional, we ignore its loading failure here.
        }
      }
      try {
        NativeLibraryLoader.getInstance().loadLibrary(tmpDir);
      } catch (final IOException e) {
        libraryLoaded.set(LibraryState.NOT_LOADED);
        throw new RuntimeException("Unable to load the RocksDB shared library", e);
      }

      final int encodedVersion = version();
      version = Version.fromEncodedVersion(encodedVersion);

      libraryLoaded.set(LibraryState.LOADED);
      return;
    }

    while (libraryLoaded.get() == LibraryState.LOADING) {
      try {
        Thread.sleep(10);
      } catch(final InterruptedException e) {
        //ignore
      }
    }
  }

  /**
   * Tries to load the necessary library files from the given list of
   * directories.
   *
   * @param paths a list of strings where each describes a directory
   *     of a library.
*/ public static void loadLibrary(final List paths) { if (libraryLoaded.get() == LibraryState.LOADED) { return; } if (libraryLoaded.compareAndSet(LibraryState.NOT_LOADED, LibraryState.LOADING)) { for (final CompressionType compressionType : CompressionType.values()) { if (compressionType.equals(CompressionType.NO_COMPRESSION)) { continue; } for (final String path : paths) { try { System.load(path + "/" + Environment.getSharedLibraryFileName( compressionType.getLibraryName())); break; } catch (final UnsatisfiedLinkError e) { // since they are optional, we ignore loading fails. } } } boolean success = false; UnsatisfiedLinkError err = null; for (final String path : paths) { try { System.load(path + "/" + Environment.getJniLibraryFileName("rocksdbjni")); success = true; break; } catch (final UnsatisfiedLinkError e) { err = e; } } if (!success) { libraryLoaded.set(LibraryState.NOT_LOADED); throw err; } final int encodedVersion = version(); version = Version.fromEncodedVersion(encodedVersion); libraryLoaded.set(LibraryState.LOADED); return; } while (libraryLoaded.get() == LibraryState.LOADING) { try { Thread.sleep(10); } catch(final InterruptedException e) { //ignore } } } public static Version rocksdbVersion() { return version; } /** * Private constructor. * * @param nativeHandle The native handle of the C++ RocksDB object */ protected RocksDB(final long nativeHandle) { super(nativeHandle); } /** * The factory constructor of RocksDB that opens a RocksDB instance given * the path to the database using the default options w/ createIfMissing * set to true. * * @param path the path to the rocksdb. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. * @see Options#setCreateIfMissing(boolean) */ public static RocksDB open(final String path) throws RocksDBException { final Options options = new Options(); options.setCreateIfMissing(true); return open(options, path); } /** * The factory constructor of RocksDB that opens a RocksDB instance given * the path to the database using the specified options and db path and a list * of column family names. *

* <p>If opened in read write mode every existing column family name must be
* passed within the list to this method.</p>
* <p>If opened in read-only mode only a subset of existing column families must
* be passed to this method.</p>
* <p>Options instance *should* not be disposed before all DBs using this options
* instance have been closed. If user doesn't call options dispose explicitly,
* then this options instance will be GC'd automatically.</p>
* <p>ColumnFamily handles are disposed when the RocksDB instance is disposed.</p>
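* <p>A usage sketch (a hedged illustration only; it assumes the database at the given path
* and both column families already exist, and the variable names are not part of this API):
* <pre>{@code
* List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
*     new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
*     new ColumnFamilyDescriptor("state".getBytes()));
* List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
* try (RocksDB db = RocksDB.open("/tmp/rocksdb", cfDescriptors, cfHandles)) {
*     db.put(cfHandles.get(1), "key".getBytes(), "value".getBytes());
* }
* }</pre>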

* * @param path the path to the rocksdb. * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. * @see DBOptions#setCreateIfMissing(boolean) */ public static RocksDB open(final String path, final List columnFamilyDescriptors, final List columnFamilyHandles) throws RocksDBException { final DBOptions options = new DBOptions(); return open(options, path, columnFamilyDescriptors, columnFamilyHandles); } /** * The factory constructor of RocksDB that opens a RocksDB instance given * the path to the database using the specified options and db path. * *

* <p>Options instance *should* not be disposed before all DBs using this options
* instance have been closed. If user doesn't call options dispose explicitly,
* then this options instance will be GC'd automatically.</p>
* <p>Options instance can be re-used to open multiple DBs if DB statistics is
* not used. If DB statistics are required, then its recommended to open DB
* with new Options instance as underlying native statistics instance does not
* use any locks to prevent concurrent updates.</p>

* * @param options {@link org.rocksdb.Options} instance. * @param path the path to the rocksdb. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. * * @see Options#setCreateIfMissing(boolean) */ public static RocksDB open(final Options options, final String path) throws RocksDBException { // when non-default Options is used, keeping an Options reference // in RocksDB can prevent Java to GC during the life-time of // the currently-created RocksDB. final RocksDB db = new RocksDB(open(options.nativeHandle_, path)); db.storeOptionsInstance(options); return db; } /** * The factory constructor of RocksDB that opens a RocksDB instance given * the path to the database using the specified options and db path and a list * of column family names. *

* <p>If opened in read write mode every existing column family name must be
* passed within the list to this method.</p>
* <p>If opened in read-only mode only a subset of existing column families must
* be passed to this method.</p>
* <p>Options instance *should* not be disposed before all DBs using this options
* instance have been closed. If user doesn't call options dispose explicitly,
* then this options instance will be GC'd automatically.</p>
* <p>Options instance can be re-used to open multiple DBs if DB statistics is
* not used. If DB statistics are required, then its recommended to open DB
* with new Options instance as underlying native statistics instance does not
* use any locks to prevent concurrent updates.</p>
* <p>ColumnFamily handles are disposed when the RocksDB instance is disposed.</p>

* * @param options {@link org.rocksdb.DBOptions} instance. * @param path the path to the rocksdb. * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. * * @see DBOptions#setCreateIfMissing(boolean) */ public static RocksDB open(final DBOptions options, final String path, final List columnFamilyDescriptors, final List columnFamilyHandles) throws RocksDBException { final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; for (int i = 0; i < columnFamilyDescriptors.size(); i++) { final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors .get(i); cfNames[i] = cfDescriptor.getName(); cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_; } final long[] handles = open(options.nativeHandle_, path, cfNames, cfOptionHandles); final RocksDB db = new RocksDB(handles[0]); db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); columnFamilyHandles.add(columnFamilyHandle); } db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); return db; } /** * The factory constructor of RocksDB that opens a RocksDB instance in * Read-Only mode given the path to the database using the default * options. * * @param path the path to the RocksDB. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public static RocksDB openReadOnly(final String path) throws RocksDBException { // This allows to use the rocksjni default Options instead of // the c++ one. final Options options = new Options(); return openReadOnly(options, path); } /** * The factory constructor of RocksDB that opens a RocksDB instance in * Read-Only mode given the path to the database using the specified * options and db path. * * Options instance *should* not be disposed before all DBs using this options * instance have been closed. If user doesn't call options dispose explicitly, * then this options instance will be GC'd automatically. * * @param options {@link Options} instance. * @param path the path to the RocksDB. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public static RocksDB openReadOnly(final Options options, final String path) throws RocksDBException { return openReadOnly(options, path, false); } /** * The factory constructor of RocksDB that opens a RocksDB instance in * Read-Only mode given the path to the database using the specified * options and db path. * * Options instance *should* not be disposed before all DBs using this options * instance have been closed. If user doesn't call options dispose explicitly, * then this options instance will be GC'd automatically. * * @param options {@link Options} instance. * @param path the path to the RocksDB. * @param errorIfWalFileExists true to raise an error when opening the db * if a Write Ahead Log file exists, false otherwise. 
* @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public static RocksDB openReadOnly(final Options options, final String path, final boolean errorIfWalFileExists) throws RocksDBException { // when non-default Options is used, keeping an Options reference // in RocksDB can prevent Java to GC during the life-time of // the currently-created RocksDB. final RocksDB db = new RocksDB(openROnly(options.nativeHandle_, path, errorIfWalFileExists)); db.storeOptionsInstance(options); return db; } /** * The factory constructor of RocksDB that opens a RocksDB instance in * Read-Only mode given the path to the database using the default * options. * * @param path the path to the RocksDB. * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public static RocksDB openReadOnly(final String path, final List columnFamilyDescriptors, final List columnFamilyHandles) throws RocksDBException { // This allows to use the rocksjni default Options instead of // the c++ one. final DBOptions options = new DBOptions(); return openReadOnly(options, path, columnFamilyDescriptors, columnFamilyHandles, false); } /** * The factory constructor of RocksDB that opens a RocksDB instance in * Read-Only mode given the path to the database using the specified * options and db path. * *

* <p>This open method allows to open RocksDB using a subset of available
* column families.</p>
* <p>Options instance *should* not be disposed before all DBs using this
* options instance have been closed. If user doesn't call options dispose
* explicitly, then this options instance will be GC'd automatically.</p>
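* <p>A hedged sketch of a read-only open with just the default column family (the path and
* the variable names are illustrative assumptions):
* <pre>{@code
* List<ColumnFamilyDescriptor> cfDescriptors = Collections.singletonList(
*     new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
* List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
* try (DBOptions options = new DBOptions();
*      RocksDB db = RocksDB.openReadOnly(options, "/tmp/rocksdb", cfDescriptors, cfHandles)) {
*     byte[] value = db.get(cfHandles.get(0), "key".getBytes());
* }
* }</pre>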

* * @param options {@link DBOptions} instance. * @param path the path to the RocksDB. * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public static RocksDB openReadOnly(final DBOptions options, final String path, final List columnFamilyDescriptors, final List columnFamilyHandles) throws RocksDBException { return openReadOnly(options, path, columnFamilyDescriptors, columnFamilyHandles, false); } /** * The factory constructor of RocksDB that opens a RocksDB instance in * Read-Only mode given the path to the database using the specified * options and db path. * *

* <p>This open method allows to open RocksDB using a subset of available
* column families.</p>
* <p>Options instance *should* not be disposed before all DBs using this
* options instance have been closed. If user doesn't call options dispose
* explicitly, then this options instance will be GC'd automatically.</p>

* * @param options {@link DBOptions} instance. * @param path the path to the RocksDB. * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. * @param errorIfWalFileExists true to raise an error when opening the db * if a Write Ahead Log file exists, false otherwise. * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public static RocksDB openReadOnly(final DBOptions options, final String path, final List columnFamilyDescriptors, final List columnFamilyHandles, final boolean errorIfWalFileExists) throws RocksDBException { // when non-default Options is used, keeping an Options reference // in RocksDB can prevent Java to GC during the life-time of // the currently-created RocksDB. final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; for (int i = 0; i < columnFamilyDescriptors.size(); i++) { final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors .get(i); cfNames[i] = cfDescriptor.getName(); cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_; } final long[] handles = openROnly(options.nativeHandle_, path, cfNames, cfOptionHandles, errorIfWalFileExists); final RocksDB db = new RocksDB(handles[0]); db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); columnFamilyHandles.add(columnFamilyHandle); } db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); return db; } /** * Open DB as secondary instance with only the default column family. * * The secondary instance can dynamically tail the MANIFEST of * a primary that must have already been created. User can call * {@link #tryCatchUpWithPrimary()} to make the secondary instance catch up * with primary (WAL tailing is NOT supported now) whenever the user feels * necessary. Column families created by the primary after the secondary * instance starts are currently ignored by the secondary instance. * Column families opened by secondary and dropped by the primary will be * dropped by secondary as well. However the user of the secondary instance * can still access the data of such dropped column family as long as they * do not destroy the corresponding column family handle. * WAL tailing is not supported at present, but will arrive soon. * * @param options the options to open the secondary instance. * @param path the path to the primary RocksDB instance. * @param secondaryPath points to a directory where the secondary instance * stores its info log * * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public static RocksDB openAsSecondary(final Options options, final String path, final String secondaryPath) throws RocksDBException { // when non-default Options is used, keeping an Options reference // in RocksDB can prevent Java to GC during the life-time of // the currently-created RocksDB. final RocksDB db = new RocksDB(openAsSecondary(options.nativeHandle_, path, secondaryPath)); db.storeOptionsInstance(options); return db; } /** * Open DB as secondary instance with column families. * You can open a subset of column families in secondary mode. 
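* <p>A hedged sketch (paths and variable names are assumptions; the primary instance must
* already exist):
* <pre>{@code
* List<ColumnFamilyDescriptor> cfDescriptors = Collections.singletonList(
*     new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
* List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
* try (DBOptions options = new DBOptions();
*      RocksDB secondary = RocksDB.openAsSecondary(
*          options, "/data/primary", "/data/secondary-info-log", cfDescriptors, cfHandles)) {
*     secondary.tryCatchUpWithPrimary(); // catch up with the primary's MANIFEST
* }
* }</pre>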
* * The secondary instance can dynamically tail the MANIFEST of * a primary that must have already been created. User can call * {@link #tryCatchUpWithPrimary()} to make the secondary instance catch up * with primary (WAL tailing is NOT supported now) whenever the user feels * necessary. Column families created by the primary after the secondary * instance starts are currently ignored by the secondary instance. * Column families opened by secondary and dropped by the primary will be * dropped by secondary as well. However the user of the secondary instance * can still access the data of such dropped column family as long as they * do not destroy the corresponding column family handle. * WAL tailing is not supported at present, but will arrive soon. * * @param options the options to open the secondary instance. * @param path the path to the primary RocksDB instance. * @param secondaryPath points to a directory where the secondary instance * stores its info log. * @param columnFamilyDescriptors list of column family descriptors * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances * on open. * * @return a {@link RocksDB} instance on success, null if the specified * {@link RocksDB} can not be opened. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public static RocksDB openAsSecondary(final DBOptions options, final String path, final String secondaryPath, final List columnFamilyDescriptors, final List columnFamilyHandles) throws RocksDBException { // when non-default Options is used, keeping an Options reference // in RocksDB can prevent Java to GC during the life-time of // the currently-created RocksDB. final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; for (int i = 0; i < columnFamilyDescriptors.size(); i++) { final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors.get(i); cfNames[i] = cfDescriptor.getName(); cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_; } final long[] handles = openAsSecondary(options.nativeHandle_, path, secondaryPath, cfNames, cfOptionHandles); final RocksDB db = new RocksDB(handles[0]); db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); columnFamilyHandles.add(columnFamilyHandle); } db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); return db; } /** * This is similar to {@link #close()} except that it * throws an exception if any error occurs. * * This will not fsync the WAL files. * If syncing is required, the caller must first call {@link #syncWal()} * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch * with {@link WriteOptions#setSync(boolean)} set to true. * * See also {@link #close()}. * * @throws RocksDBException if an error occurs whilst closing. */ public void closeE() throws RocksDBException { for (final ColumnFamilyHandle columnFamilyHandle : ownedColumnFamilyHandles) { columnFamilyHandle.close(); } ownedColumnFamilyHandles.clear(); if (owningHandle_.compareAndSet(true, false)) { try { closeDatabase(nativeHandle_); } finally { disposeInternal(); } } } /** * This is similar to {@link #closeE()} except that it * silently ignores any errors. * * This will not fsync the WAL files. 
* If syncing is required, the caller must first call {@link #syncWal()} * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch * with {@link WriteOptions#setSync(boolean)} set to true. * * See also {@link #close()}. */ @Override public void close() { for (final ColumnFamilyHandle columnFamilyHandle : ownedColumnFamilyHandles) { columnFamilyHandle.close(); } ownedColumnFamilyHandles.clear(); if (owningHandle_.compareAndSet(true, false)) { try { closeDatabase(nativeHandle_); } catch (final RocksDBException e) { // silently ignore the error report } finally { disposeInternal(); } } } /** * Static method to determine all available column families for a * rocksdb database identified by path * * @param options Options for opening the database * @param path Absolute path to rocksdb database * @return List<byte[]> List containing the column family names * * @throws RocksDBException thrown if error happens in underlying * native library. */ public static List listColumnFamilies(final Options options, final String path) throws RocksDBException { return Arrays.asList(RocksDB.listColumnFamilies(options.nativeHandle_, path)); } /** * Creates a new column family with the name columnFamilyName and * allocates a ColumnFamilyHandle within an internal structure. * The ColumnFamilyHandle is automatically disposed with DB disposal. * * @param columnFamilyDescriptor column family to be created. * @return {@link org.rocksdb.ColumnFamilyHandle} instance. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public ColumnFamilyHandle createColumnFamily( final ColumnFamilyDescriptor columnFamilyDescriptor) throws RocksDBException { final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, createColumnFamily(nativeHandle_, columnFamilyDescriptor.getName(), columnFamilyDescriptor.getName().length, columnFamilyDescriptor.getOptions().nativeHandle_)); ownedColumnFamilyHandles.add(columnFamilyHandle); return columnFamilyHandle; } /** * Bulk create column families with the same column family options. * * @param columnFamilyOptions the options for the column families. * @param columnFamilyNames the names of the column families. * * @return the handles to the newly created column families. * * @throws RocksDBException if an error occurs whilst creating * the column families */ public List createColumnFamilies( final ColumnFamilyOptions columnFamilyOptions, final List columnFamilyNames) throws RocksDBException { final byte[][] cfNames = columnFamilyNames.toArray( new byte[0][]); final long[] cfHandles = createColumnFamilies(nativeHandle_, columnFamilyOptions.nativeHandle_, cfNames); final List columnFamilyHandles = new ArrayList<>(cfHandles.length); for (int i = 0; i < cfHandles.length; i++) { final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandles[i]); columnFamilyHandles.add(columnFamilyHandle); } ownedColumnFamilyHandles.addAll(columnFamilyHandles); return columnFamilyHandles; } /** * Bulk create column families with the same column family options. * * @param columnFamilyDescriptors the descriptions of the column families. * * @return the handles to the newly created column families. 
* * @throws RocksDBException if an error occurs whilst creating * the column families */ public List createColumnFamilies( final List columnFamilyDescriptors) throws RocksDBException { final long[] cfOptsHandles = new long[columnFamilyDescriptors.size()]; final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; for (int i = 0; i < columnFamilyDescriptors.size(); i++) { final ColumnFamilyDescriptor columnFamilyDescriptor = columnFamilyDescriptors.get(i); cfOptsHandles[i] = columnFamilyDescriptor.getOptions().nativeHandle_; cfNames[i] = columnFamilyDescriptor.getName(); } final long[] cfHandles = createColumnFamilies(nativeHandle_, cfOptsHandles, cfNames); final List columnFamilyHandles = new ArrayList<>(cfHandles.length); for (int i = 0; i < cfHandles.length; i++) { final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandles[i]); columnFamilyHandles.add(columnFamilyHandle); } ownedColumnFamilyHandles.addAll(columnFamilyHandles); return columnFamilyHandles; } /** * Drops the column family specified by {@code columnFamilyHandle}. This call * only records a drop record in the manifest and prevents the column * family from flushing and compacting. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void dropColumnFamily(final ColumnFamilyHandle columnFamilyHandle) throws RocksDBException { dropColumnFamily(nativeHandle_, columnFamilyHandle.nativeHandle_); } // Bulk drop column families. This call only records drop records in the // manifest and prevents the column families from flushing and compacting. // In case of error, the request may succeed partially. User may call // ListColumnFamilies to check the result. public void dropColumnFamilies( final List columnFamilies) throws RocksDBException { final long[] cfHandles = new long[columnFamilies.size()]; for (int i = 0; i < columnFamilies.size(); i++) { cfHandles[i] = columnFamilies.get(i).nativeHandle_; } dropColumnFamilies(nativeHandle_, cfHandles); } /** * Deletes native column family handle of given {@link ColumnFamilyHandle} Java object * and removes reference from {@link RocksDB#ownedColumnFamilyHandles}. * * @param columnFamilyHandle column family handle object. */ public void destroyColumnFamilyHandle(final ColumnFamilyHandle columnFamilyHandle) { for (int i = 0; i < ownedColumnFamilyHandles.size(); ++i) { final ColumnFamilyHandle ownedHandle = ownedColumnFamilyHandles.get(i); if (ownedHandle.equals(columnFamilyHandle)) { columnFamilyHandle.close(); ownedColumnFamilyHandles.remove(i); return; } } } /** * Set the database entry for "key" to "value". * * @param key the specified key to be inserted. * @param value the value associated with the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void put(final byte[] key, final byte[] value) throws RocksDBException { put(nativeHandle_, key, 0, key.length, value, 0, value.length); } /** * Set the database entry for "key" to "value". 
* * @param key The specified key to be inserted * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than ("key".length - offset) * @param value the value associated with the specified key * @param vOffset the offset of the "value" array to be used, must be * non-negative and no longer than "key".length * @param vLen the length of the "value" array to be used, must be * non-negative and no larger than ("value".length - offset) * * @throws RocksDBException thrown if errors happens in underlying native * library. * @throws IndexOutOfBoundsException if an offset or length is out of bounds */ public void put(final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { checkBounds(offset, len, key.length); checkBounds(vOffset, vLen, value.length); put(nativeHandle_, key, offset, len, value, vOffset, vLen); } /** * Set the database entry for "key" to "value" in the specified * column family. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key the specified key to be inserted. * @param value the value associated with the specified key. * * throws IllegalArgumentException if column family is not present * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value) throws RocksDBException { put(nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_); } /** * Set the database entry for "key" to "value" in the specified * column family. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key The specified key to be inserted * @param offset the offset of the "key" array to be used, must * be non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than ("key".length - offset) * @param value the value associated with the specified key * @param vOffset the offset of the "value" array to be used, must be * non-negative and no longer than "key".length * @param vLen the length of the "value" array to be used, must be * non-negative and no larger than ("value".length - offset) * * @throws RocksDBException thrown if errors happens in underlying native * library. * @throws IndexOutOfBoundsException if an offset or length is out of bounds */ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { checkBounds(offset, len, key.length); checkBounds(vOffset, vLen, value.length); put(nativeHandle_, key, offset, len, value, vOffset, vLen, columnFamilyHandle.nativeHandle_); } /** * Set the database entry for "key" to "value". * * @param writeOpts {@link org.rocksdb.WriteOptions} instance. * @param key the specified key to be inserted. * @param value the value associated with the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void put(final WriteOptions writeOpts, final byte[] key, final byte[] value) throws RocksDBException { put(nativeHandle_, writeOpts.nativeHandle_, key, 0, key.length, value, 0, value.length); } /** * Set the database entry for "key" to "value". 
* * @param writeOpts {@link org.rocksdb.WriteOptions} instance. * @param key The specified key to be inserted * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than ("key".length - offset) * @param value the value associated with the specified key * @param vOffset the offset of the "value" array to be used, must be * non-negative and no longer than "key".length * @param vLen the length of the "value" array to be used, must be * non-negative and no larger than ("value".length - offset) * * @throws RocksDBException thrown if error happens in underlying * native library. * @throws IndexOutOfBoundsException if an offset or length is out of bounds */ public void put(final WriteOptions writeOpts, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { checkBounds(offset, len, key.length); checkBounds(vOffset, vLen, value.length); put(nativeHandle_, writeOpts.nativeHandle_, key, offset, len, value, vOffset, vLen); } /** * Set the database entry for "key" to "value" for the specified * column family. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param writeOpts {@link org.rocksdb.WriteOptions} instance. * @param key the specified key to be inserted. * @param value the value associated with the specified key. * * throws IllegalArgumentException if column family is not present * * @throws RocksDBException thrown if error happens in underlying * native library. * @see IllegalArgumentException */ public void put(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpts, final byte[] key, final byte[] value) throws RocksDBException { put(nativeHandle_, writeOpts.nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_); } /** * Set the database entry for "key" to "value" for the specified * column family. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param writeOpts {@link org.rocksdb.WriteOptions} instance. * @param key the specified key to be inserted. Position and limit is used. * Supports direct buffer only. * @param value the value associated with the specified key. Position and limit is used. * Supports direct buffer only. * * throws IllegalArgumentException if column family is not present * * @throws RocksDBException thrown if error happens in underlying * native library. * @see IllegalArgumentException */ public void put(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpts, final ByteBuffer key, final ByteBuffer value) throws RocksDBException { assert key.isDirect() && value.isDirect(); putDirect(nativeHandle_, writeOpts.nativeHandle_, key, key.position(), key.remaining(), value, value.position(), value.remaining(), columnFamilyHandle.nativeHandle_); key.position(key.limit()); value.position(value.limit()); } /** * Set the database entry for "key" to "value". * * @param writeOpts {@link org.rocksdb.WriteOptions} instance. * @param key the specified key to be inserted. Position and limit is used. * Supports direct buffer only. * @param value the value associated with the specified key. Position and limit is used. * Supports direct buffer only. * * throws IllegalArgumentException if column family is not present * * @throws RocksDBException thrown if error happens in underlying * native library. 
* @see IllegalArgumentException */ public void put(final WriteOptions writeOpts, final ByteBuffer key, final ByteBuffer value) throws RocksDBException { assert key.isDirect() && value.isDirect(); putDirect(nativeHandle_, writeOpts.nativeHandle_, key, key.position(), key.remaining(), value, value.position(), value.remaining(), 0); key.position(key.limit()); value.position(value.limit()); } /** * Set the database entry for "key" to "value" for the specified * column family. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param writeOpts {@link org.rocksdb.WriteOptions} instance. * @param key The specified key to be inserted * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than ("key".length - offset) * @param value the value associated with the specified key * @param vOffset the offset of the "value" array to be used, must be * non-negative and no longer than "key".length * @param vLen the length of the "value" array to be used, must be * non-negative and no larger than ("value".length - offset) * * @throws RocksDBException thrown if error happens in underlying * native library. * @throws IndexOutOfBoundsException if an offset or length is out of bounds */ public void put(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpts, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { checkBounds(offset, len, key.length); checkBounds(vOffset, vLen, value.length); put(nativeHandle_, writeOpts.nativeHandle_, key, offset, len, value, vOffset, vLen, columnFamilyHandle.nativeHandle_); } /** * Remove the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. * * @deprecated Use {@link #delete(byte[])} */ @Deprecated public void remove(final byte[] key) throws RocksDBException { delete(key); } /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void delete(final byte[] key) throws RocksDBException { delete(nativeHandle_, key, 0, key.length); } /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param key Key to delete within database * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be * non-negative and no larger than ("key".length - offset) * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void delete(final byte[] key, final int offset, final int len) throws RocksDBException { delete(nativeHandle_, key, offset, len); } /** * Remove the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. 
* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. * * @deprecated Use {@link #delete(ColumnFamilyHandle, byte[])} */ @Deprecated public void remove(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) throws RocksDBException { delete(columnFamilyHandle, key); } /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void delete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) throws RocksDBException { delete(nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_); } /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key Key to delete within database * @param offset the offset of the "key" array to be used, * must be non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than ("value".length - offset) * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void delete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final int offset, final int len) throws RocksDBException { delete(nativeHandle_, key, offset, len, columnFamilyHandle.nativeHandle_); } /** * Remove the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param writeOpt WriteOptions to be used with delete operation * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. * * @deprecated Use {@link #delete(WriteOptions, byte[])} */ @Deprecated public void remove(final WriteOptions writeOpt, final byte[] key) throws RocksDBException { delete(writeOpt, key); } /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param writeOpt WriteOptions to be used with delete operation * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void delete(final WriteOptions writeOpt, final byte[] key) throws RocksDBException { delete(nativeHandle_, writeOpt.nativeHandle_, key, 0, key.length); } /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param writeOpt WriteOptions to be used with delete operation * @param key Key to delete within database * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be * non-negative and no larger than ("key".length - offset) * * @throws RocksDBException thrown if error happens in underlying * native library. 
*/ public void delete(final WriteOptions writeOpt, final byte[] key, final int offset, final int len) throws RocksDBException { delete(nativeHandle_, writeOpt.nativeHandle_, key, offset, len); } /** * Remove the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param writeOpt WriteOptions to be used with delete operation * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. * * @deprecated Use {@link #delete(ColumnFamilyHandle, WriteOptions, byte[])} */ @Deprecated public void remove(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpt, final byte[] key) throws RocksDBException { delete(columnFamilyHandle, writeOpt, key); } /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param writeOpt WriteOptions to be used with delete operation * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void delete(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpt, final byte[] key) throws RocksDBException { delete(nativeHandle_, writeOpt.nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_); } /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param writeOpt WriteOptions to be used with delete operation * @param key Key to delete within database * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be * non-negative and no larger than ("key".length - offset) * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void delete(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpt, final byte[] key, final int offset, final int len) throws RocksDBException { delete(nativeHandle_, writeOpt.nativeHandle_, key, offset, len, columnFamilyHandle.nativeHandle_); } /** * Get the value associated with the specified key within column family. * * @param opt {@link org.rocksdb.ReadOptions} instance. * @param key the key to retrieve the value. It is using position and limit. * Supports direct buffer only. * @param value the out-value to receive the retrieved value. * It is using position and limit. Limit is set according to value size. * Supports direct buffer only. * @return The size of the actual value that matches the specified * {@code key} in byte. If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * * @throws RocksDBException thrown if error happens in underlying * native library. 
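* <p>A hedged sketch of a direct-buffer lookup (assumes an already opened {@code db}; buffer
* sizes are illustrative):
* <pre>{@code
* ByteBuffer key = ByteBuffer.allocateDirect(16).put("key".getBytes());
* key.flip();
* ByteBuffer value = ByteBuffer.allocateDirect(1024);
* int size = db.get(new ReadOptions(), key, value);
* if (size == RocksDB.NOT_FOUND) {
*     // key is absent
* } else if (size > 1024) {
*     // partial result: the stored value is larger than the supplied buffer
* }
* }</pre>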
*/ public int get(final ReadOptions opt, final ByteBuffer key, final ByteBuffer value) throws RocksDBException { assert key.isDirect() && value.isDirect(); int result = getDirect(nativeHandle_, opt.nativeHandle_, key, key.position(), key.remaining(), value, value.position(), value.remaining(), 0); if (result != NOT_FOUND) { value.limit(Math.min(value.limit(), value.position() + result)); } key.position(key.limit()); return result; } /** * Get the value associated with the specified key within column family. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param opt {@link org.rocksdb.ReadOptions} instance. * @param key the key to retrieve the value. It is using position and limit. * Supports direct buffer only. * @param value the out-value to receive the retrieved value. * It is using position and limit. Limit is set according to value size. * Supports direct buffer only. * @return The size of the actual value that matches the specified * {@code key} in byte. If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public int get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions opt, final ByteBuffer key, final ByteBuffer value) throws RocksDBException { assert key.isDirect() && value.isDirect(); int result = getDirect(nativeHandle_, opt.nativeHandle_, key, key.position(), key.remaining(), value, value.position(), value.remaining(), columnFamilyHandle.nativeHandle_); if (result != NOT_FOUND) { value.limit(Math.min(value.limit(), value.position() + result)); } key.position(key.limit()); return result; } /** * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist * in the database. * * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple * times), then the result of calling SingleDelete() on this key is undefined. * SingleDelete() only behaves correctly if there has been only one Put() * for this key since the previous call to SingleDelete() for this key. * * This feature is currently an experimental performance optimization * for a very specific workload. It is up to the caller to ensure that * SingleDelete is only used for a key that is not deleted using Delete() or * written using Merge(). Mixing SingleDelete operations with Deletes and * Merges can result in undefined behavior. * * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ @Experimental("Performance optimization for a very specific workload") public void singleDelete(final byte[] key) throws RocksDBException { singleDelete(nativeHandle_, key, key.length); } /** * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist * in the database. * * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple * times), then the result of calling SingleDelete() on this key is undefined. * SingleDelete() only behaves correctly if there has been only one Put() * for this key since the previous call to SingleDelete() for this key. * * This feature is currently an experimental performance optimization * for a very specific workload. 
It is up to the caller to ensure that * SingleDelete is only used for a key that is not deleted using Delete() or * written using Merge(). Mixing SingleDelete operations with Deletes and * Merges can result in undefined behavior. * * @param columnFamilyHandle The column family to delete the key from * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ @Experimental("Performance optimization for a very specific workload") public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) throws RocksDBException { singleDelete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } /** * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist * in the database. * * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple * times), then the result of calling SingleDelete() on this key is undefined. * SingleDelete() only behaves correctly if there has been only one Put() * for this key since the previous call to SingleDelete() for this key. * * This feature is currently an experimental performance optimization * for a very specific workload. It is up to the caller to ensure that * SingleDelete is only used for a key that is not deleted using Delete() or * written using Merge(). Mixing SingleDelete operations with Deletes and * Merges can result in undefined behavior. * * Note: consider setting {@link WriteOptions#setSync(boolean)} true. * * @param writeOpt Write options for the delete * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ @Experimental("Performance optimization for a very specific workload") public void singleDelete(final WriteOptions writeOpt, final byte[] key) throws RocksDBException { singleDelete(nativeHandle_, writeOpt.nativeHandle_, key, key.length); } /** * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist * in the database. * * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple * times), then the result of calling SingleDelete() on this key is undefined. * SingleDelete() only behaves correctly if there has been only one Put() * for this key since the previous call to SingleDelete() for this key. * * This feature is currently an experimental performance optimization * for a very specific workload. It is up to the caller to ensure that * SingleDelete is only used for a key that is not deleted using Delete() or * written using Merge(). Mixing SingleDelete operations with Deletes and * Merges can result in undefined behavior. * * Note: consider setting {@link WriteOptions#setSync(boolean)} true. * * @param columnFamilyHandle The column family to delete the key from * @param writeOpt Write options for the delete * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. 
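* <p>An illustrative sequence for which SingleDelete is well defined (assumes an open
* {@code db}, a {@code cfHandle} and {@code writeOpts} created by the caller):
* <pre>{@code
* byte[] key = "session-42".getBytes();
* db.put(cfHandle, writeOpts, key, "payload".getBytes());
* // exactly one Put since the last SingleDelete for this key, and no Delete/Merge mixed in
* db.singleDelete(cfHandle, writeOpts, key);
* }</pre>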
*/ @Experimental("Performance optimization for a very specific workload") public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpt, final byte[] key) throws RocksDBException { singleDelete(nativeHandle_, writeOpt.nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., * including "beginKey" and excluding "endKey". a non-OK status on error. It * is not an error if no keys exist in the range ["beginKey", "endKey"). * * Delete the database entry (if any) for "key". Returns OK on success, and a * non-OK status on error. It is not an error if "key" did not exist in the * database. * * @param beginKey First key to delete within database (inclusive) * @param endKey Last key to delete within database (exclusive) * * @throws RocksDBException thrown if error happens in underlying native * library. */ public void deleteRange(final byte[] beginKey, final byte[] endKey) throws RocksDBException { deleteRange(nativeHandle_, beginKey, 0, beginKey.length, endKey, 0, endKey.length); } /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., * including "beginKey" and excluding "endKey". a non-OK status on error. It * is not an error if no keys exist in the range ["beginKey", "endKey"). * * Delete the database entry (if any) for "key". Returns OK on success, and a * non-OK status on error. It is not an error if "key" did not exist in the * database. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance * @param beginKey First key to delete within database (inclusive) * @param endKey Last key to delete within database (exclusive) * * @throws RocksDBException thrown if error happens in underlying native * library. */ public void deleteRange(final ColumnFamilyHandle columnFamilyHandle, final byte[] beginKey, final byte[] endKey) throws RocksDBException { deleteRange(nativeHandle_, beginKey, 0, beginKey.length, endKey, 0, endKey.length, columnFamilyHandle.nativeHandle_); } /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., * including "beginKey" and excluding "endKey". a non-OK status on error. It * is not an error if no keys exist in the range ["beginKey", "endKey"). * * Delete the database entry (if any) for "key". Returns OK on success, and a * non-OK status on error. It is not an error if "key" did not exist in the * database. * * @param writeOpt WriteOptions to be used with delete operation * @param beginKey First key to delete within database (inclusive) * @param endKey Last key to delete within database (exclusive) * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void deleteRange(final WriteOptions writeOpt, final byte[] beginKey, final byte[] endKey) throws RocksDBException { deleteRange(nativeHandle_, writeOpt.nativeHandle_, beginKey, 0, beginKey.length, endKey, 0, endKey.length); } /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., * including "beginKey" and excluding "endKey". a non-OK status on error. It * is not an error if no keys exist in the range ["beginKey", "endKey"). * * Delete the database entry (if any) for "key". Returns OK on success, and a * non-OK status on error. It is not an error if "key" did not exist in the * database. 
* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance * @param writeOpt WriteOptions to be used with delete operation * @param beginKey First key to delete within database (included) * @param endKey Last key to delete within database (excluded) * * @throws RocksDBException thrown if error happens in underlying native * library. */ public void deleteRange(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpt, final byte[] beginKey, final byte[] endKey) throws RocksDBException { deleteRange(nativeHandle_, writeOpt.nativeHandle_, beginKey, 0, beginKey.length, endKey, 0, endKey.length, columnFamilyHandle.nativeHandle_); } /** * Add merge operand for key/value pair. * * @param key the specified key to be merged. * @param value the value to be merged with the current value for the * specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void merge(final byte[] key, final byte[] value) throws RocksDBException { merge(nativeHandle_, key, 0, key.length, value, 0, value.length); } /** * Add merge operand for key/value pair. * * @param key the specified key to be merged. * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than ("key".length - offset) * @param value the value to be merged with the current value for the * specified key. * @param vOffset the offset of the "value" array to be used, must be * non-negative and no longer than "key".length * @param vLen the length of the "value" array to be used, must be * non-negative and must be non-negative and no larger than * ("value".length - offset) * * @throws RocksDBException thrown if error happens in underlying * native library. * @throws IndexOutOfBoundsException if an offset or length is out of bounds */ public void merge(final byte[] key, int offset, int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { checkBounds(offset, len, key.length); checkBounds(vOffset, vLen, value.length); merge(nativeHandle_, key, offset, len, value, vOffset, vLen); } /** * Add merge operand for key/value pair in a ColumnFamily. * * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param key the specified key to be merged. * @param value the value to be merged with the current value for * the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void merge(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value) throws RocksDBException { merge(nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_); } /** * Add merge operand for key/value pair in a ColumnFamily. * * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param key the specified key to be merged. * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than ("key".length - offset) * @param value the value to be merged with the current value for * the specified key. 
* @param vOffset the offset of the "value" array to be used, must be * non-negative and no longer than "key".length * @param vLen the length of the "value" array to be used, must be * must be non-negative and no larger than ("value".length - offset) * * @throws RocksDBException thrown if error happens in underlying * native library. * @throws IndexOutOfBoundsException if an offset or length is out of bounds */ public void merge(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { checkBounds(offset, len, key.length); checkBounds(vOffset, vLen, value.length); merge(nativeHandle_, key, offset, len, value, vOffset, vLen, columnFamilyHandle.nativeHandle_); } /** * Add merge operand for key/value pair. * * @param writeOpts {@link WriteOptions} for this write. * @param key the specified key to be merged. * @param value the value to be merged with the current value for * the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void merge(final WriteOptions writeOpts, final byte[] key, final byte[] value) throws RocksDBException { merge(nativeHandle_, writeOpts.nativeHandle_, key, 0, key.length, value, 0, value.length); } /** * Add merge operand for key/value pair. * * @param writeOpts {@link WriteOptions} for this write. * @param key the specified key to be merged. * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than ("value".length - offset) * @param value the value to be merged with the current value for * the specified key. * @param vOffset the offset of the "value" array to be used, must be * non-negative and no longer than "key".length * @param vLen the length of the "value" array to be used, must be * non-negative and no larger than ("value".length - offset) * * @throws RocksDBException thrown if error happens in underlying * native library. * @throws IndexOutOfBoundsException if an offset or length is out of bounds */ public void merge(final WriteOptions writeOpts, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { checkBounds(offset, len, key.length); checkBounds(vOffset, vLen, value.length); merge(nativeHandle_, writeOpts.nativeHandle_, key, offset, len, value, vOffset, vLen); } /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. * * @param writeOpt WriteOptions to be used with delete operation * @param key Key to delete within database. It is using position and limit. * Supports direct buffer only. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void delete(final WriteOptions writeOpt, final ByteBuffer key) throws RocksDBException { assert key.isDirect(); deleteDirect(nativeHandle_, writeOpt.nativeHandle_, key, key.position(), key.remaining(), 0); key.position(key.limit()); } /** * Delete the database entry (if any) for "key". Returns OK on * success, and a non-OK status on error. It is not an error if "key" * did not exist in the database. 
* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param writeOpt WriteOptions to be used with delete operation * @param key Key to delete within database. It is using position and limit. * Supports direct buffer only. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void delete(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpt, final ByteBuffer key) throws RocksDBException { assert key.isDirect(); deleteDirect(nativeHandle_, writeOpt.nativeHandle_, key, key.position(), key.remaining(), columnFamilyHandle.nativeHandle_); key.position(key.limit()); } /** * Add merge operand for key/value pair. * * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param writeOpts {@link WriteOptions} for this write. * @param key the specified key to be merged. * @param value the value to be merged with the current value for the * specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void merge(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpts, final byte[] key, final byte[] value) throws RocksDBException { merge(nativeHandle_, writeOpts.nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_); } /** * Add merge operand for key/value pair. * * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param writeOpts {@link WriteOptions} for this write. * @param key the specified key to be merged. * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than ("key".length - offset) * @param value the value to be merged with the current value for * the specified key. * @param vOffset the offset of the "value" array to be used, must be * non-negative and no longer than "key".length * @param vLen the length of the "value" array to be used, must be * non-negative and no larger than ("value".length - offset) * * @throws RocksDBException thrown if error happens in underlying * native library. * @throws IndexOutOfBoundsException if an offset or length is out of bounds */ public void merge( final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpts, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { checkBounds(offset, len, key.length); checkBounds(vOffset, vLen, value.length); merge(nativeHandle_, writeOpts.nativeHandle_, key, offset, len, value, vOffset, vLen, columnFamilyHandle.nativeHandle_); } /** * Apply the specified updates to the database. * * @param writeOpts WriteOptions instance * @param updates WriteBatch instance * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void write(final WriteOptions writeOpts, final WriteBatch updates) throws RocksDBException { write0(nativeHandle_, writeOpts.nativeHandle_, updates.nativeHandle_); } /** * Apply the specified updates to the database. * * @param writeOpts WriteOptions instance * @param updates WriteBatchWithIndex instance * * @throws RocksDBException thrown if error happens in underlying * native library. 
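 *
 * <p>Illustrative usage sketch, shown with a plain {@link WriteBatch}; a
 * {@code WriteBatchWithIndex} is applied the same way (assumes an
 * already-open {@code RocksDB} instance {@code db}; keys and values are
 * arbitrary examples):</p>
 * <pre>{@code
 * try (final WriteBatch batch = new WriteBatch();
 *      final WriteOptions writeOpts = new WriteOptions()) {
 *   batch.put("k1".getBytes(), "v1".getBytes());
 *   batch.delete("k2".getBytes());
 *   db.write(writeOpts, batch); // both updates become visible together
 * }
 * }</pre>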
*/ public void write(final WriteOptions writeOpts, final WriteBatchWithIndex updates) throws RocksDBException { write1(nativeHandle_, writeOpts.nativeHandle_, updates.nativeHandle_); } // TODO(AR) we should improve the #get() API, returning -1 (RocksDB.NOT_FOUND) is not very nice // when we could communicate better status into, also the C++ code show that -2 could be returned /** * Get the value associated with the specified key within column family* * * @param key the key to retrieve the value. * @param value the out-value to receive the retrieved value. * * @return The size of the actual value that matches the specified * {@code key} in byte. If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public int get(final byte[] key, final byte[] value) throws RocksDBException { return get(nativeHandle_, key, 0, key.length, value, 0, value.length); } /** * Get the value associated with the specified key within column family* * * @param key the key to retrieve the value. * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than ("key".length - offset) * @param value the out-value to receive the retrieved value. * @param vOffset the offset of the "value" array to be used, must be * non-negative and no longer than "value".length * @param vLen the length of the "value" array to be used, must be * non-negative and and no larger than ("value".length - offset) * * @return The size of the actual value that matches the specified * {@code key} in byte. If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public int get(final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { checkBounds(offset, len, key.length); checkBounds(vOffset, vLen, value.length); return get(nativeHandle_, key, offset, len, value, vOffset, vLen); } /** * Get the value associated with the specified key within column family. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key the key to retrieve the value. * @param value the out-value to receive the retrieved value. * @return The size of the actual value that matches the specified * {@code key} in byte. If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * * @throws RocksDBException thrown if error happens in underlying * native library. 
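 *
 * <p>Illustrative usage sketch, shown with the default column family
 * (assumes an already-open {@code RocksDB} instance {@code db}; the key
 * and buffer size are arbitrary examples):</p>
 * <pre>{@code
 * final byte[] key = "k1".getBytes();
 * final byte[] value = new byte[64];          // caller-supplied buffer
 * final int size = db.get(key, value);
 * if (size == RocksDB.NOT_FOUND) {
 *   // no entry for this key
 * } else if (size > value.length) {
 *   // buffer too small: only a partial value was copied; full length is 'size'
 * }
 * }</pre>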
*/ public int get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value) throws RocksDBException, IllegalArgumentException { return get(nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_); } /** * Get the value associated with the specified key within column family. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key the key to retrieve the value. * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * an no larger than ("key".length - offset) * @param value the out-value to receive the retrieved value. * @param vOffset the offset of the "value" array to be used, must be * non-negative and no longer than "key".length * @param vLen the length of the "value" array to be used, must be * non-negative and no larger than ("value".length - offset) * * @return The size of the actual value that matches the specified * {@code key} in byte. If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public int get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException, IllegalArgumentException { checkBounds(offset, len, key.length); checkBounds(vOffset, vLen, value.length); return get(nativeHandle_, key, offset, len, value, vOffset, vLen, columnFamilyHandle.nativeHandle_); } /** * Get the value associated with the specified key. * * @param opt {@link org.rocksdb.ReadOptions} instance. * @param key the key to retrieve the value. * @param value the out-value to receive the retrieved value. * @return The size of the actual value that matches the specified * {@code key} in byte. If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public int get(final ReadOptions opt, final byte[] key, final byte[] value) throws RocksDBException { return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length, value, 0, value.length); } /** * Get the value associated with the specified key. * * @param opt {@link org.rocksdb.ReadOptions} instance. * @param key the key to retrieve the value. * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than ("key".length - offset) * @param value the out-value to receive the retrieved value. * @param vOffset the offset of the "value" array to be used, must be * non-negative and no longer than "key".length * @param vLen the length of the "value" array to be used, must be * non-negative and no larger than ("value".length - offset) * @return The size of the actual value that matches the specified * {@code key} in byte. 
If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public int get(final ReadOptions opt, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { checkBounds(offset, len, key.length); checkBounds(vOffset, vLen, value.length); return get(nativeHandle_, opt.nativeHandle_, key, offset, len, value, vOffset, vLen); } /** * Get the value associated with the specified key within column family. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param opt {@link org.rocksdb.ReadOptions} instance. * @param key the key to retrieve the value. * @param value the out-value to receive the retrieved value. * @return The size of the actual value that matches the specified * {@code key} in byte. If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public int get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions opt, final byte[] key, final byte[] value) throws RocksDBException { return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length, value, 0, value.length, columnFamilyHandle.nativeHandle_); } /** * Get the value associated with the specified key within column family. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param opt {@link org.rocksdb.ReadOptions} instance. * @param key the key to retrieve the value. * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be * non-negative and and no larger than ("key".length - offset) * @param value the out-value to receive the retrieved value. * @param vOffset the offset of the "value" array to be used, must be * non-negative and no longer than "key".length * @param vLen the length of the "value" array to be used, and must be * non-negative and no larger than ("value".length - offset) * @return The size of the actual value that matches the specified * {@code key} in byte. If the return value is greater than the * length of {@code value}, then it indicates that the size of the * input buffer {@code value} is insufficient and partial result will * be returned. RocksDB.NOT_FOUND will be returned if the value not * found. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public int get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions opt, final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { checkBounds(offset, len, key.length); checkBounds(vOffset, vLen, value.length); return get(nativeHandle_, opt.nativeHandle_, key, offset, len, value, vOffset, vLen, columnFamilyHandle.nativeHandle_); } /** * The simplified version of get which returns a new byte array storing * the value associated with the specified input key if any. null will be * returned if the specified key is not found. 
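 *
 * <p>Illustrative usage sketch (assumes an already-open {@code RocksDB}
 * instance {@code db}; the key is an arbitrary example):</p>
 * <pre>{@code
 * final byte[] value = db.get("k1".getBytes());
 * if (value == null) {
 *   // key not present
 * }
 * }</pre>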
   *
   * @param key the key to retrieve the value.
   * @return a byte array storing the value associated with the input key if
   *     any. null if it does not find the specified key.
   *
   * @throws RocksDBException thrown if error happens in underlying
   *     native library.
   */
  public byte[] get(final byte[] key) throws RocksDBException {
    return get(nativeHandle_, key, 0, key.length);
  }

  /**
   * The simplified version of get which returns a new byte array storing
   * the value associated with the specified input key if any. null will be
   * returned if the specified key is not found.
   *
   * @param key the key to retrieve the value.
   * @param offset the offset of the "key" array to be used, must be
   *     non-negative and no larger than "key".length
   * @param len the length of the "key" array to be used, must be non-negative
   *     and no larger than ("key".length - offset)
   * @return a byte array storing the value associated with the input key if
   *     any. null if it does not find the specified key.
   *
   * @throws RocksDBException thrown if error happens in underlying
   *     native library.
   */
  public byte[] get(final byte[] key, final int offset, final int len) throws RocksDBException {
    checkBounds(offset, len, key.length);
    return get(nativeHandle_, key, offset, len);
  }

  /**
   * The simplified version of get which returns a new byte array storing
   * the value associated with the specified input key if any. null will be
   * returned if the specified key is not found.
   *
   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
   *     instance
   * @param key the key to retrieve the value.
   * @return a byte array storing the value associated with the input key if
   *     any. null if it does not find the specified key.
   *
   * @throws RocksDBException thrown if error happens in underlying
   *     native library.
   */
  public byte[] get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key)
      throws RocksDBException {
    // TODO: ------------ start: custom modification (timing instrumentation) --------------- //
    long start = System.currentTimeMillis();
    byte[] state = get(nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_);
    this.elapsed(start);
    // TODO: ------------ end: custom modification --------------- //
    return state;
  }

  /**
   * The simplified version of get which returns a new byte array storing
   * the value associated with the specified input key if any. null will be
   * returned if the specified key is not found.
   *
   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
   *     instance
   * @param key the key to retrieve the value.
   * @param offset the offset of the "key" array to be used, must be
   *     non-negative and no larger than "key".length
   * @param len the length of the "key" array to be used, must be non-negative
   *     and no larger than ("key".length - offset)
   * @return a byte array storing the value associated with the input key if
   *     any. null if it does not find the specified key.
   *
   * @throws RocksDBException thrown if error happens in underlying
   *     native library.
   */
  public byte[] get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final int offset,
      final int len) throws RocksDBException {
    checkBounds(offset, len, key.length);
    return get(nativeHandle_, key, offset, len, columnFamilyHandle.nativeHandle_);
  }

  /**
   * The simplified version of get which returns a new byte array storing
   * the value associated with the specified input key if any. null will be
   * returned if the specified key is not found.
   *
   * @param key the key to retrieve the value.
   * @param opt Read options.
   * @return a byte array storing the value associated with the input key if
   *     any. null if it does not find the specified key.
* * @throws RocksDBException thrown if error happens in underlying * native library. */ public byte[] get(final ReadOptions opt, final byte[] key) throws RocksDBException { return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length); } /** * The simplified version of get which returns a new byte array storing * the value associated with the specified input key if any. null will be * returned if the specified key is not found. * * @param key the key retrieve the value. * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than ("key".length - offset) * @param opt Read options. * @return a byte array storing the value associated with the input key if * any. null if it does not find the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public byte[] get(final ReadOptions opt, final byte[] key, final int offset, final int len) throws RocksDBException { checkBounds(offset, len, key.length); return get(nativeHandle_, opt.nativeHandle_, key, offset, len); } /** * The simplified version of get which returns a new byte array storing * the value associated with the specified input key if any. null will be * returned if the specified key is not found. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key the key retrieve the value. * @param opt Read options. * @return a byte array storing the value associated with the input key if * any. null if it does not find the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public byte[] get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions opt, final byte[] key) throws RocksDBException { return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_); } /** * The simplified version of get which returns a new byte array storing * the value associated with the specified input key if any. null will be * returned if the specified key is not found. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @param key the key retrieve the value. * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than ("key".length - offset) * @param opt Read options. * @return a byte array storing the value associated with the input key if * any. null if it does not find the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public byte[] get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions opt, final byte[] key, final int offset, final int len) throws RocksDBException { checkBounds(offset, len, key.length); return get(nativeHandle_, opt.nativeHandle_, key, offset, len, columnFamilyHandle.nativeHandle_); } /** * Returns a map of keys for which values were found in DB. * * @param keys List of keys for which values need to be retrieved. * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * * @throws RocksDBException thrown if error happens in underlying * native library. * * @deprecated Consider {@link #multiGetAsList(List)} instead. 
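 *
 * <p>Illustrative sketch of the recommended replacement (assumes an
 * already-open {@code RocksDB} instance {@code db} and
 * {@code java.util.Arrays}; keys are arbitrary examples):</p>
 * <pre>{@code
 * final List<byte[]> keys = Arrays.asList("k1".getBytes(), "k2".getBytes());
 * // values.get(i) is null when keys.get(i) was not found
 * final List<byte[]> values = db.multiGetAsList(keys);
 * }</pre>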
*/ @Deprecated public Map multiGet(final List keys) throws RocksDBException { assert(keys.size() != 0); final byte[][] keysArray = keys.toArray(new byte[0][]); final int keyOffsets[] = new int[keysArray.length]; final int keyLengths[] = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } final byte[][] values = multiGet(nativeHandle_, keysArray, keyOffsets, keyLengths); final Map keyValueMap = new HashMap<>(computeCapacityHint(values.length)); for(int i = 0; i < values.length; i++) { if(values[i] == null) { continue; } keyValueMap.put(keys.get(i), values[i]); } return keyValueMap; } /** * Returns a map of keys for which values were found in DB. *

* Note: Every key needs to have a related column family name in * {@code columnFamilyHandleList}. *

* * @param columnFamilyHandleList {@link java.util.List} containing * {@link org.rocksdb.ColumnFamilyHandle} instances. * @param keys List of keys for which values need to be retrieved. * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * * @throws RocksDBException thrown if error happens in underlying * native library. * @throws IllegalArgumentException thrown if the size of passed keys is not * equal to the amount of passed column family handles. * * @deprecated Consider {@link #multiGetAsList(List, List)} instead. */ @Deprecated public Map multiGet( final List columnFamilyHandleList, final List keys) throws RocksDBException, IllegalArgumentException { assert(keys.size() != 0); // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. if (keys.size() != columnFamilyHandleList.size()) { throw new IllegalArgumentException( "For each key there must be a ColumnFamilyHandle."); } final long[] cfHandles = new long[columnFamilyHandleList.size()]; for (int i = 0; i < columnFamilyHandleList.size(); i++) { cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; } final byte[][] keysArray = keys.toArray(new byte[0][]); final int keyOffsets[] = new int[keysArray.length]; final int keyLengths[] = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } final byte[][] values = multiGet(nativeHandle_, keysArray, keyOffsets, keyLengths, cfHandles); final Map keyValueMap = new HashMap<>(computeCapacityHint(values.length)); for(int i = 0; i < values.length; i++) { if (values[i] == null) { continue; } keyValueMap.put(keys.get(i), values[i]); } return keyValueMap; } /** * Returns a map of keys for which values were found in DB. * * @param opt Read options. * @param keys of keys for which values need to be retrieved. * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * * @throws RocksDBException thrown if error happens in underlying * native library. * * @deprecated Consider {@link #multiGetAsList(ReadOptions, List)} instead. */ @Deprecated public Map multiGet(final ReadOptions opt, final List keys) throws RocksDBException { assert(keys.size() != 0); final byte[][] keysArray = keys.toArray(new byte[0][]); final int keyOffsets[] = new int[keysArray.length]; final int keyLengths[] = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } final byte[][] values = multiGet(nativeHandle_, opt.nativeHandle_, keysArray, keyOffsets, keyLengths); final Map keyValueMap = new HashMap<>(computeCapacityHint(values.length)); for(int i = 0; i < values.length; i++) { if(values[i] == null) { continue; } keyValueMap.put(keys.get(i), values[i]); } return keyValueMap; } /** * Returns a map of keys for which values were found in DB. *

* Note: Every key needs to have a related column family name in * {@code columnFamilyHandleList}. *

* * @param opt Read options. * @param columnFamilyHandleList {@link java.util.List} containing * {@link org.rocksdb.ColumnFamilyHandle} instances. * @param keys of keys for which values need to be retrieved. * @return Map where key of map is the key passed by user and value for map * entry is the corresponding value in DB. * * @throws RocksDBException thrown if error happens in underlying * native library. * @throws IllegalArgumentException thrown if the size of passed keys is not * equal to the amount of passed column family handles. * * @deprecated Consider {@link #multiGetAsList(ReadOptions, List, List)} * instead. */ @Deprecated public Map multiGet(final ReadOptions opt, final List columnFamilyHandleList, final List keys) throws RocksDBException { assert(keys.size() != 0); // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. if (keys.size()!=columnFamilyHandleList.size()){ throw new IllegalArgumentException( "For each key there must be a ColumnFamilyHandle."); } final long[] cfHandles = new long[columnFamilyHandleList.size()]; for (int i = 0; i < columnFamilyHandleList.size(); i++) { cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; } final byte[][] keysArray = keys.toArray(new byte[0][]); final int keyOffsets[] = new int[keysArray.length]; final int keyLengths[] = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } final byte[][] values = multiGet(nativeHandle_, opt.nativeHandle_, keysArray, keyOffsets, keyLengths, cfHandles); final Map keyValueMap = new HashMap<>(computeCapacityHint(values.length)); for(int i = 0; i < values.length; i++) { if(values[i] == null) { continue; } keyValueMap.put(keys.get(i), values[i]); } return keyValueMap; } /** * Takes a list of keys, and returns a list of values for the given list of * keys. List will contain null for keys which could not be found. * * @param keys List of keys for which values need to be retrieved. * @return List of values for the given list of keys. List will contain * null for keys which could not be found. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public List multiGetAsList(final List keys) throws RocksDBException { assert(keys.size() != 0); final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); final int keyOffsets[] = new int[keysArray.length]; final int keyLengths[] = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } return Arrays.asList(multiGet(nativeHandle_, keysArray, keyOffsets, keyLengths)); } /** * Returns a list of values for the given list of keys. List will contain * null for keys which could not be found. *

* Note: Every key needs to have a related column family name in * {@code columnFamilyHandleList}. *

* * @param columnFamilyHandleList {@link java.util.List} containing * {@link org.rocksdb.ColumnFamilyHandle} instances. * @param keys List of keys for which values need to be retrieved. * @return List of values for the given list of keys. List will contain * null for keys which could not be found. * * @throws RocksDBException thrown if error happens in underlying * native library. * @throws IllegalArgumentException thrown if the size of passed keys is not * equal to the amount of passed column family handles. */ public List multiGetAsList( final List columnFamilyHandleList, final List keys) throws RocksDBException, IllegalArgumentException { assert(keys.size() != 0); // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. if (keys.size() != columnFamilyHandleList.size()) { throw new IllegalArgumentException( "For each key there must be a ColumnFamilyHandle."); } final long[] cfHandles = new long[columnFamilyHandleList.size()]; for (int i = 0; i < columnFamilyHandleList.size(); i++) { cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; } final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); final int keyOffsets[] = new int[keysArray.length]; final int keyLengths[] = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } return Arrays.asList(multiGet(nativeHandle_, keysArray, keyOffsets, keyLengths, cfHandles)); } /** * Returns a list of values for the given list of keys. List will contain * null for keys which could not be found. * * @param opt Read options. * @param keys of keys for which values need to be retrieved. * @return List of values for the given list of keys. List will contain * null for keys which could not be found. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public List multiGetAsList(final ReadOptions opt, final List keys) throws RocksDBException { assert(keys.size() != 0); final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); final int keyOffsets[] = new int[keysArray.length]; final int keyLengths[] = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } return Arrays.asList(multiGet(nativeHandle_, opt.nativeHandle_, keysArray, keyOffsets, keyLengths)); } /** * Returns a list of values for the given list of keys. List will contain * null for keys which could not be found. *

* Note: Every key needs to have a related column family name in * {@code columnFamilyHandleList}. *

* * @param opt Read options. * @param columnFamilyHandleList {@link java.util.List} containing * {@link org.rocksdb.ColumnFamilyHandle} instances. * @param keys of keys for which values need to be retrieved. * @return List of values for the given list of keys. List will contain * null for keys which could not be found. * * @throws RocksDBException thrown if error happens in underlying * native library. * @throws IllegalArgumentException thrown if the size of passed keys is not * equal to the amount of passed column family handles. */ public List multiGetAsList(final ReadOptions opt, final List columnFamilyHandleList, final List keys) throws RocksDBException { assert(keys.size() != 0); // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. if (keys.size()!=columnFamilyHandleList.size()){ throw new IllegalArgumentException( "For each key there must be a ColumnFamilyHandle."); } final long[] cfHandles = new long[columnFamilyHandleList.size()]; for (int i = 0; i < columnFamilyHandleList.size(); i++) { cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; } final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); final int keyOffsets[] = new int[keysArray.length]; final int keyLengths[] = new int[keysArray.length]; for(int i = 0; i < keyLengths.length; i++) { keyLengths[i] = keysArray[i].length; } return Arrays.asList(multiGet(nativeHandle_, opt.nativeHandle_, keysArray, keyOffsets, keyLengths, cfHandles)); } /** * If the key definitely does not exist in the database, then this method * returns null, else it returns an instance of KeyMayExistResult * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. * * This check is potentially lighter-weight than invoking * {@link #get(byte[])}. One way to make this lighter weight is to avoid * doing any IOs. * * @param key byte array of a key to search for * @param valueHolder non-null to retrieve the value if it is found, or null * if the value is not needed. If non-null, upon return of the function, * the {@code value} will be set if it could be retrieved. * * @return false if the key definitely does not exist in the database, * otherwise true. */ public boolean keyMayExist(final byte[] key, /* @Nullable */ final Holder valueHolder) { return keyMayExist(key, 0, key.length, valueHolder); } /** * If the key definitely does not exist in the database, then this method * returns null, else it returns an instance of KeyMayExistResult * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. * * This check is potentially lighter-weight than invoking * {@link #get(byte[], int, int)}. One way to make this lighter weight is to * avoid doing any IOs. * * @param key byte array of a key to search for * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than "key".length * @param valueHolder non-null to retrieve the value if it is found, or null * if the value is not needed. If non-null, upon return of the function, * the {@code value} will be set if it could be retrieved. * * @return false if the key definitely does not exist in the database, * otherwise true. 
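 *
 * <p>Illustrative usage sketch, shown with the simple overload (assumes an
 * already-open {@code RocksDB} instance {@code db}; the key is an arbitrary
 * example):</p>
 * <pre>{@code
 * final Holder<byte[]> holder = new Holder<>();
 * if (db.keyMayExist("k1".getBytes(), holder)) {
 *   // the key may exist; holder.getValue() is non-null only when the value
 *   // could be retrieved without extra I/O
 *   final byte[] maybeValue = holder.getValue();
 * } else {
 *   // the key definitely does not exist
 * }
 * }</pre>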
*/ public boolean keyMayExist(final byte[] key, final int offset, final int len, /* @Nullable */ final Holder valueHolder) { return keyMayExist((ColumnFamilyHandle)null, key, offset, len, valueHolder); } /** * If the key definitely does not exist in the database, then this method * returns null, else it returns an instance of KeyMayExistResult * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. * * This check is potentially lighter-weight than invoking * {@link #get(ColumnFamilyHandle,byte[])}. One way to make this lighter * weight is to avoid doing any IOs. * * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param key byte array of a key to search for * @param valueHolder non-null to retrieve the value if it is found, or null * if the value is not needed. If non-null, upon return of the function, * the {@code value} will be set if it could be retrieved. * * @return false if the key definitely does not exist in the database, * otherwise true. */ public boolean keyMayExist( final ColumnFamilyHandle columnFamilyHandle, final byte[] key, /* @Nullable */ final Holder valueHolder) { return keyMayExist(columnFamilyHandle, key, 0, key.length, valueHolder); } /** * If the key definitely does not exist in the database, then this method * returns null, else it returns an instance of KeyMayExistResult * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. * * This check is potentially lighter-weight than invoking * {@link #get(ColumnFamilyHandle, byte[], int, int)}. One way to make this * lighter weight is to avoid doing any IOs. * * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param key byte array of a key to search for * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than "key".length * @param valueHolder non-null to retrieve the value if it is found, or null * if the value is not needed. If non-null, upon return of the function, * the {@code value} will be set if it could be retrieved. * * @return false if the key definitely does not exist in the database, * otherwise true. */ public boolean keyMayExist( final ColumnFamilyHandle columnFamilyHandle, final byte[] key, int offset, int len, /* @Nullable */ final Holder valueHolder) { return keyMayExist(columnFamilyHandle, null, key, offset, len, valueHolder); } /** * If the key definitely does not exist in the database, then this method * returns null, else it returns an instance of KeyMayExistResult * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. * * This check is potentially lighter-weight than invoking * {@link #get(ReadOptions, byte[])}. One way to make this * lighter weight is to avoid doing any IOs. * * @param readOptions {@link ReadOptions} instance * @param key byte array of a key to search for * @param valueHolder non-null to retrieve the value if it is found, or null * if the value is not needed. If non-null, upon return of the function, * the {@code value} will be set if it could be retrieved. * * @return false if the key definitely does not exist in the database, * otherwise true. 
*/ public boolean keyMayExist( final ReadOptions readOptions, final byte[] key, /* @Nullable */ final Holder valueHolder) { return keyMayExist(readOptions, key, 0, key.length, valueHolder); } /** * If the key definitely does not exist in the database, then this method * returns null, else it returns an instance of KeyMayExistResult * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. * * This check is potentially lighter-weight than invoking * {@link #get(ReadOptions, byte[], int, int)}. One way to make this * lighter weight is to avoid doing any IOs. * * @param readOptions {@link ReadOptions} instance * @param key byte array of a key to search for * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than "key".length * @param valueHolder non-null to retrieve the value if it is found, or null * if the value is not needed. If non-null, upon return of the function, * the {@code value} will be set if it could be retrieved. * * @return false if the key definitely does not exist in the database, * otherwise true. */ public boolean keyMayExist( final ReadOptions readOptions, final byte[] key, final int offset, final int len, /* @Nullable */ final Holder valueHolder) { return keyMayExist(null, readOptions, key, offset, len, valueHolder); } /** * If the key definitely does not exist in the database, then this method * returns null, else it returns an instance of KeyMayExistResult * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. * * This check is potentially lighter-weight than invoking * {@link #get(ColumnFamilyHandle, ReadOptions, byte[])}. One way to make this * lighter weight is to avoid doing any IOs. * * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param readOptions {@link ReadOptions} instance * @param key byte array of a key to search for * @param valueHolder non-null to retrieve the value if it is found, or null * if the value is not needed. If non-null, upon return of the function, * the {@code value} will be set if it could be retrieved. * * @return false if the key definitely does not exist in the database, * otherwise true. */ public boolean keyMayExist( final ColumnFamilyHandle columnFamilyHandle, final ReadOptions readOptions, final byte[] key, /* @Nullable */ final Holder valueHolder) { return keyMayExist(columnFamilyHandle, readOptions, key, 0, key.length, valueHolder); } /** * If the key definitely does not exist in the database, then this method * returns null, else it returns an instance of KeyMayExistResult * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. * * This check is potentially lighter-weight than invoking * {@link #get(ColumnFamilyHandle, ReadOptions, byte[], int, int)}. * One way to make this lighter weight is to avoid doing any IOs. * * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param readOptions {@link ReadOptions} instance * @param key byte array of a key to search for * @param offset the offset of the "key" array to be used, must be * non-negative and no larger than "key".length * @param len the length of the "key" array to be used, must be non-negative * and no larger than "key".length * @param valueHolder non-null to retrieve the value if it is found, or null * if the value is not needed. 
If non-null, upon return of the function, * the {@code value} will be set if it could be retrieved. * * @return false if the key definitely does not exist in the database, * otherwise true. */ public boolean keyMayExist( final ColumnFamilyHandle columnFamilyHandle, final ReadOptions readOptions, final byte[] key, final int offset, final int len, /* @Nullable */ final Holder valueHolder) { checkBounds(offset, len, key.length); if (valueHolder == null) { return keyMayExist(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, readOptions == null ? 0 : readOptions.nativeHandle_, key, offset, len); } else { final byte[][] result = keyMayExistFoundValue( nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, readOptions == null ? 0 : readOptions.nativeHandle_, key, offset, len); if (result[0][0] == 0x0) { valueHolder.setValue(null); return false; } else if (result[0][0] == 0x1) { valueHolder.setValue(null); return true; } else { valueHolder.setValue(result[1]); return true; } } } /** *

Return a heap-allocated iterator over the contents of the
 * database. The result of newIterator() is initially invalid
 * (caller must call one of the Seek methods on the iterator
 * before using it).
 *
Caller should close the iterator when it is no longer needed. * The returned iterator should be closed before this db is closed. *
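 *
 * <p>Illustrative usage sketch (assumes an already-open {@code RocksDB}
 * instance {@code db}):</p>
 * <pre>{@code
 * try (final RocksIterator it = db.newIterator()) {
 *   for (it.seekToFirst(); it.isValid(); it.next()) {
 *     final byte[] k = it.key();
 *     final byte[] v = it.value();
 *     // process k / v
 *   }
 * }
 * }</pre>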

* * @return instance of iterator object. */ public RocksIterator newIterator() { return new RocksIterator(this, iterator(nativeHandle_)); } /** *

Return a heap-allocated iterator over the contents of the
 * database. The result of newIterator() is initially invalid
 * (caller must call one of the Seek methods on the iterator
 * before using it).
 *
Caller should close the iterator when it is no longer needed. * The returned iterator should be closed before this db is closed. *

* * @param readOptions {@link ReadOptions} instance. * @return instance of iterator object. */ public RocksIterator newIterator(final ReadOptions readOptions) { return new RocksIterator(this, iterator(nativeHandle_, readOptions.nativeHandle_)); } /** *

Return a heap-allocated iterator over the contents of a
 * ColumnFamily. The result of newIterator() is initially invalid
 * (caller must call one of the Seek methods on the iterator
 * before using it).
 *
Caller should close the iterator when it is no longer needed. * The returned iterator should be closed before this db is closed. *

* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance * @return instance of iterator object. */ public RocksIterator newIterator( final ColumnFamilyHandle columnFamilyHandle) { return new RocksIterator(this, iteratorCF(nativeHandle_, columnFamilyHandle.nativeHandle_)); } /** *

Return a heap-allocated iterator over the contents of a
 * ColumnFamily. The result of newIterator() is initially invalid
 * (caller must call one of the Seek methods on the iterator
 * before using it).
 *
Caller should close the iterator when it is no longer needed. * The returned iterator should be closed before this db is closed. *

   *
   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
   *     instance
   * @param readOptions {@link ReadOptions} instance.
   * @return instance of iterator object.
   */
  public RocksIterator newIterator(final ColumnFamilyHandle columnFamilyHandle,
      final ReadOptions readOptions) {
    return new RocksIterator(this, iteratorCF(nativeHandle_, columnFamilyHandle.nativeHandle_,
        readOptions.nativeHandle_));
  }

  /**
   * Returns iterators from a consistent database state across multiple
   * column families. Iterators are heap allocated and need to be deleted
   * before the db is deleted.
   *
   * @param columnFamilyHandleList {@link java.util.List} containing
   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
   * @return {@link java.util.List} containing {@link org.rocksdb.RocksIterator}
   *     instances
   *
   * @throws RocksDBException thrown if error happens in underlying
   *     native library.
   */
  public List<RocksIterator> newIterators(
      final List<ColumnFamilyHandle> columnFamilyHandleList) throws RocksDBException {
    return newIterators(columnFamilyHandleList, new ReadOptions());
  }

  /**
   * Returns iterators from a consistent database state across multiple
   * column families. Iterators are heap allocated and need to be deleted
   * before the db is deleted.
   *
   * @param columnFamilyHandleList {@link java.util.List} containing
   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
   * @param readOptions {@link ReadOptions} instance.
   * @return {@link java.util.List} containing {@link org.rocksdb.RocksIterator}
   *     instances
   *
   * @throws RocksDBException thrown if error happens in underlying
   *     native library.
   */
  public List<RocksIterator> newIterators(
      final List<ColumnFamilyHandle> columnFamilyHandleList,
      final ReadOptions readOptions) throws RocksDBException {
    final long[] columnFamilyHandles = new long[columnFamilyHandleList.size()];
    for (int i = 0; i < columnFamilyHandleList.size(); i++) {
      columnFamilyHandles[i] = columnFamilyHandleList.get(i).nativeHandle_;
    }

    final long[] iteratorRefs = iterators(nativeHandle_, columnFamilyHandles,
        readOptions.nativeHandle_);

    final List<RocksIterator> iterators = new ArrayList<>(columnFamilyHandleList.size());
    for (int i = 0; i < columnFamilyHandleList.size(); i++) {
      iterators.add(new RocksIterator(this, iteratorRefs[i]));
    }
    return iterators;
  }

  /**
   * Return a handle to the current DB state. Iterators created with
   * this handle will all observe a stable snapshot of the current DB
   * state. The caller must call ReleaseSnapshot(result) when the
   * snapshot is no longer needed.
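 *
 * <p>Illustrative usage sketch (assumes an already-open {@code RocksDB}
 * instance {@code db}; the key is an arbitrary example):</p>
 * <pre>{@code
 * final Snapshot snapshot = db.getSnapshot(); // may be null, see below
 * try (final ReadOptions readOptions = new ReadOptions().setSnapshot(snapshot)) {
 *   final byte[] valueAtSnapshot = db.get(readOptions, "k1".getBytes());
 * } finally {
 *   db.releaseSnapshot(snapshot);
 * }
 * }</pre>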

 *
 * null will be returned if the DB fails to take a snapshot or does
 * not support snapshot.
* * @return Snapshot {@link Snapshot} instance */ public Snapshot getSnapshot() { long snapshotHandle = getSnapshot(nativeHandle_); if (snapshotHandle != 0) { return new Snapshot(snapshotHandle); } return null; } /** * Release a previously acquired snapshot. * * The caller must not use "snapshot" after this call. * * @param snapshot {@link Snapshot} instance */ public void releaseSnapshot(final Snapshot snapshot) { if (snapshot != null) { releaseSnapshot(nativeHandle_, snapshot.nativeHandle_); } } /** * DB implements can export properties about their state * via this method on a per column family level. * *

If {@code property} is a valid property understood by this DB
 * implementation, fills {@code value} with its current value and
 * returns true. Otherwise returns false.
 *
 * Valid property names include:
 *   • "rocksdb.num-files-at-level<N>" - return the number of files at
 *     level <N>, where <N> is an ASCII representation of a level
 *     number (e.g. "0").
 *   • "rocksdb.stats" - returns a multi-line string that describes statistics
 *     about the internal operation of the DB.
 *   • "rocksdb.sstables" - returns a multi-line string that describes all
 *     of the sstables that make up the db contents.
* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance, or null for the default column family. * @param property to be fetched. See above for examples * @return property value * * @throws RocksDBException thrown if error happens in underlying * native library. */ public String getProperty( /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, final String property) throws RocksDBException { return getProperty(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, property, property.length()); } /** * DB implementations can export properties about their state * via this method. If "property" is a valid property understood by this * DB implementation, fills "*value" with its current value and returns * true. Otherwise returns false. * *

Valid property names include:
 *   • "rocksdb.num-files-at-level<N>" - return the number of files at
 *     level <N>, where <N> is an ASCII representation of a level
 *     number (e.g. "0").
 *   • "rocksdb.stats" - returns a multi-line string that describes statistics
 *     about the internal operation of the DB.
 *   • "rocksdb.sstables" - returns a multi-line string that describes all
 *     of the sstables that make up the db contents.
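 *
 * <p>Illustrative usage sketch (assumes an already-open {@code RocksDB}
 * instance {@code db}; the property names are taken from the list above):</p>
 * <pre>{@code
 * final String stats = db.getProperty("rocksdb.stats");
 * final String filesAtLevel0 = db.getProperty("rocksdb.num-files-at-level0");
 * }</pre>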
* * @param property to be fetched. See above for examples * @return property value * * @throws RocksDBException thrown if error happens in underlying * native library. */ public String getProperty(final String property) throws RocksDBException { return getProperty(null, property); } /** * Gets a property map. * * @param property to be fetched. * * @return the property map * * @throws RocksDBException if an error happens in the underlying native code. */ public Map getMapProperty(final String property) throws RocksDBException { return getMapProperty(null, property); } /** * Gets a property map. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance, or null for the default column family. * @param property to be fetched. * * @return the property map * * @throws RocksDBException if an error happens in the underlying native code. */ public Map getMapProperty( /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, final String property) throws RocksDBException { return getMapProperty(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, property, property.length()); } /** *

Similar to GetProperty(), but only works for a subset of properties
 * whose return value is a numerical value. Returns the value as a long.
 *
 * Note: As the returned property is of type {@code uint64_t} on the C++
 * side, the returned value can be negative because Java 7 only supports
 * signed long values.
 *
 * Java 7: To mitigate the problem of the non-existent unsigned long
 * type, values should be encapsulated using {@link java.math.BigInteger}
 * to reflect the correct value. The correct behavior is guaranteed if
 * {@code 2^64} is added to negative values.
 *
 * Java 8: In Java 8 the value should be treated as unsigned long using
 * the provided methods of type {@link Long}.
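 *
 * <p>Illustrative usage sketch (assumes an already-open {@code RocksDB}
 * instance {@code db}; the property name is one example of a numerical
 * property):</p>
 * <pre>{@code
 * final long raw = db.getLongProperty("rocksdb.num-files-at-level0");
 * // Java 8+: render as unsigned in case the value exceeds Long.MAX_VALUE
 * final String unsigned = Long.toUnsignedString(raw);
 * }</pre>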

* * @param property to be fetched. * * @return numerical property value. * * @throws RocksDBException if an error happens in the underlying native code. */ public long getLongProperty(final String property) throws RocksDBException { return getLongProperty(null, property); } /** *

Similar to GetProperty(), but only works for a subset of properties
 * whose return value is a numerical value. Returns the value as a long.
 *
 * Note: As the returned property is of type {@code uint64_t} on the C++
 * side, the returned value can be negative because Java 7 only supports
 * signed long values.
 *
 * Java 7: To mitigate the problem of the non-existent unsigned long
 * type, values should be encapsulated using {@link java.math.BigInteger}
 * to reflect the correct value. The correct behavior is guaranteed if
 * {@code 2^64} is added to negative values.
 *
 * Java 8: In Java 8 the value should be treated as unsigned long using
 * the provided methods of type {@link Long}.
* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance, or null for the default column family * @param property to be fetched. * * @return numerical property value * * @throws RocksDBException if an error happens in the underlying native code. */ public long getLongProperty( /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, final String property) throws RocksDBException { return getLongProperty(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, property, property.length()); } /** * Reset internal stats for DB and all column families. * * Note this doesn't reset {@link Options#statistics()} as it is not * owned by DB. * * @throws RocksDBException if an error occurs whilst reseting the stats */ public void resetStats() throws RocksDBException { resetStats(nativeHandle_); } /** *
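  // Illustrative sketch (documentation aid): treating the uint64_t-backed value returned by
  // getLongProperty() as unsigned, as the note above recommends. Assumes an already-open `db`;
  // "rocksdb.estimate-num-keys" is used here as an example of a numeric property.
  private static void demoUnsignedLongProperty(final RocksDB db) throws RocksDBException {
    final long raw = db.getLongProperty("rocksdb.estimate-num-keys");
    // Java 8+: interpret the signed long as unsigned when printing or comparing.
    System.out.println("estimated keys: " + Long.toUnsignedString(raw));
    // Java 7 style: add 2^64 to negative values via BigInteger to recover the true value.
    java.math.BigInteger unsigned = java.math.BigInteger.valueOf(raw);
    if (unsigned.signum() < 0) {
      unsigned = unsigned.add(java.math.BigInteger.ONE.shiftLeft(64));
    }
    System.out.println("estimated keys (BigInteger): " + unsigned);
  }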

 * Returns the sum of the getLongProperty() values across all column families.
 *
 * Note: As the returned property is of type {@code uint64_t} on the C++ side,
 * the returned value can be negative because Java 7 supports only signed
 * long values.
 *
 * Java 7: To mitigate the problem of the non-existent unsigned long type,
 * values should be encapsulated using {@link java.math.BigInteger} to reflect
 * the correct value. The correct behavior is guaranteed if {@code 2^64} is
 * added to negative values.
 *
 * Java 8: In Java 8 the value should be treated as unsigned long using the
 * provided methods of type {@link Long}.
* * @param property to be fetched. * * @return numerical property value * * @throws RocksDBException if an error happens in the underlying native code. */ public long getAggregatedLongProperty(final String property) throws RocksDBException { return getAggregatedLongProperty(nativeHandle_, property, property.length()); } /** * Get the approximate file system space used by keys in each range. * * Note that the returned sizes measure file system space usage, so * if the user data compresses by a factor of ten, the returned * sizes will be one-tenth the size of the corresponding user data size. * * If {@code sizeApproximationFlags} defines whether the returned size * should include the recently written data in the mem-tables (if * the mem-table type supports it), data serialized to disk, or both. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance, or null for the default column family * @param ranges the ranges over which to approximate sizes * @param sizeApproximationFlags flags to determine what to include in the * approximation. * * @return the sizes */ public long[] getApproximateSizes( /*@Nullable*/ final ColumnFamilyHandle columnFamilyHandle, final List ranges, final SizeApproximationFlag... sizeApproximationFlags) { byte flags = 0x0; for (final SizeApproximationFlag sizeApproximationFlag : sizeApproximationFlags) { flags |= sizeApproximationFlag.getValue(); } return getApproximateSizes(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, toRangeSliceHandles(ranges), flags); } /** * Get the approximate file system space used by keys in each range for * the default column family. * * Note that the returned sizes measure file system space usage, so * if the user data compresses by a factor of ten, the returned * sizes will be one-tenth the size of the corresponding user data size. * * If {@code sizeApproximationFlags} defines whether the returned size * should include the recently written data in the mem-tables (if * the mem-table type supports it), data serialized to disk, or both. * * @param ranges the ranges over which to approximate sizes * @param sizeApproximationFlags flags to determine what to include in the * approximation. * * @return the sizes. */ public long[] getApproximateSizes(final List ranges, final SizeApproximationFlag... sizeApproximationFlags) { return getApproximateSizes(null, ranges, sizeApproximationFlags); } public static class CountAndSize { public final long count; public final long size; public CountAndSize(final long count, final long size) { this.count = count; this.size = size; } } /** * This method is similar to * {@link #getApproximateSizes(ColumnFamilyHandle, List, SizeApproximationFlag...)}, * except that it returns approximate number of records and size in memtables. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance, or null for the default column family * @param range the ranges over which to get the memtable stats * * @return the count and size for the range */ public CountAndSize getApproximateMemTableStats( /*@Nullable*/ final ColumnFamilyHandle columnFamilyHandle, final Range range) { final long[] result = getApproximateMemTableStats(nativeHandle_, columnFamilyHandle == null ? 
0 : columnFamilyHandle.nativeHandle_, range.start.getNativeHandle(), range.limit.getNativeHandle()); return new CountAndSize(result[0], result[1]); } /** * This method is similar to * {@link #getApproximateSizes(ColumnFamilyHandle, List, SizeApproximationFlag...)}, * except that it returns approximate number of records and size in memtables. * * @param range the ranges over which to get the memtable stats * * @return the count and size for the range */ public CountAndSize getApproximateMemTableStats( final Range range) { return getApproximateMemTableStats(null, range); } /** *
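  // Illustrative sketch (documentation aid): approximating on-disk and in-memory usage for a
  // key range with getApproximateSizes()/getApproximateMemTableStats() above. Assumes an
  // already-open `db`; the key bounds are example values.
  private static void demoApproximateSizes(final RocksDB db) {
    try (final Slice start = new Slice("key-000");
         final Slice limit = new Slice("key-999")) {
      final Range range = new Range(start, limit);
      // File-system space used by the range, counting both SST files and memtables.
      final long[] sizes = db.getApproximateSizes(Arrays.asList(range),
          SizeApproximationFlag.INCLUDE_FILES, SizeApproximationFlag.INCLUDE_MEMTABLES);
      // Approximate record count and byte size currently held in the memtables.
      final CountAndSize memStats = db.getApproximateMemTableStats(range);
      System.out.println("approx. bytes for range: " + sizes[0]);
      System.out.println("memtable entries: " + memStats.count + ", bytes: " + memStats.size);
    }
  }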

 * Range compaction of database.
 *
 * Note: After the database has been compacted, all data will have been
 * pushed down to the last level containing any data.
 *
 * See also:
 *   - {@link #compactRange(boolean, int, int)}
 *   - {@link #compactRange(byte[], byte[])}
 *   - {@link #compactRange(byte[], byte[], boolean, int, int)}
* * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void compactRange() throws RocksDBException { compactRange(null); } /** *
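  // Illustrative sketch (documentation aid): triggering manual compaction with the methods
  // above. Assumes an already-open `db`; the key bounds are example values.
  private static void demoCompactRange(final RocksDB db) throws RocksDBException {
    // Compact the entire default column family; afterwards all data has been pushed down to
    // the last level containing any data, as noted above.
    db.compactRange();
    // Alternatively, restrict the compaction to a key range (begin inclusive, end exclusive).
    db.compactRange("user-0000".getBytes(), "user-9999".getBytes());
  }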

 * Range compaction of column family.
 *
 * Note: After the database has been compacted, all data will have been
 * pushed down to the last level containing any data.
 *
 * See also:
 *   - {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
 *   - {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
 *   - {@link #compactRange(ColumnFamilyHandle, byte[], byte[], boolean, int, int)}
* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance, or null for the default column family. * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void compactRange( /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle) throws RocksDBException { compactRange(nativeHandle_, null, -1, null, -1, 0, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); } /** *

 * Range compaction of database.
 *
 * Note: After the database has been compacted, all data will have been
 * pushed down to the last level containing any data.
 *
 * See also:
 *   - {@link #compactRange()}
 *   - {@link #compactRange(boolean, int, int)}
 *   - {@link #compactRange(byte[], byte[], boolean, int, int)}
* * @param begin start of key range (included in range) * @param end end of key range (excluded from range) * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void compactRange(final byte[] begin, final byte[] end) throws RocksDBException { compactRange(null, begin, end); } /** *

 * Range compaction of column family.
 *
 * Note: After the database has been compacted, all data will have been
 * pushed down to the last level containing any data.
 *
 * See also:
 *   - {@link #compactRange(ColumnFamilyHandle)}
 *   - {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
 *   - {@link #compactRange(ColumnFamilyHandle, byte[], byte[], boolean, int, int)}
* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance, or null for the default column family. * @param begin start of key range (included in range) * @param end end of key range (excluded from range) * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void compactRange( /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, final byte[] begin, final byte[] end) throws RocksDBException { compactRange(nativeHandle_, begin, begin == null ? -1 : begin.length, end, end == null ? -1 : end.length, 0, columnFamilyHandle == null ? 0: columnFamilyHandle.nativeHandle_); } /** *

 * Range compaction of database.
 *
 * Note: After the database has been compacted, all data will have been
 * pushed down to the last level containing any data.
 *
 * Compaction outputs should be placed in options.db_paths[target_path_id].
 * Behavior is undefined if target_path_id is out of range.
 *
 * See also:
 *   - {@link #compactRange()}
 *   - {@link #compactRange(byte[], byte[])}
 *   - {@link #compactRange(byte[], byte[], boolean, int, int)}
* * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead * * @param changeLevel reduce level after compaction * @param targetLevel target level to compact to * @param targetPathId the target path id of output path * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ @Deprecated public void compactRange(final boolean changeLevel, final int targetLevel, final int targetPathId) throws RocksDBException { compactRange(null, changeLevel, targetLevel, targetPathId); } /** *

 * Range compaction of column family.
 *
 * Note: After the database has been compacted, all data will have been
 * pushed down to the last level containing any data.
 *
 * Compaction outputs should be placed in options.db_paths[target_path_id].
 * Behavior is undefined if target_path_id is out of range.
 *
 * See also:
 *   - {@link #compactRange(ColumnFamilyHandle)}
 *   - {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
 *   - {@link #compactRange(ColumnFamilyHandle, byte[], byte[], boolean, int, int)}
* * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance, or null for the default column family. * @param changeLevel reduce level after compaction * @param targetLevel target level to compact to * @param targetPathId the target path id of output path * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ @Deprecated public void compactRange( /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, final boolean changeLevel, final int targetLevel, final int targetPathId) throws RocksDBException { final CompactRangeOptions options = new CompactRangeOptions(); options.setChangeLevel(changeLevel); options.setTargetLevel(targetLevel); options.setTargetPathId(targetPathId); compactRange(nativeHandle_, null, -1, null, -1, options.nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); } /** *

 * Range compaction of database.
 *
 * Note: After the database has been compacted, all data will have been
 * pushed down to the last level containing any data.
 *
 * Compaction outputs should be placed in options.db_paths[target_path_id].
 * Behavior is undefined if target_path_id is out of range.
 *
 * See also:
 *   - {@link #compactRange()}
 *   - {@link #compactRange(boolean, int, int)}
 *   - {@link #compactRange(byte[], byte[])}
* * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} * instead * * @param begin start of key range (included in range) * @param end end of key range (excluded from range) * @param changeLevel reduce level after compaction * @param targetLevel target level to compact to * @param targetPathId the target path id of output path * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ @Deprecated public void compactRange(final byte[] begin, final byte[] end, final boolean changeLevel, final int targetLevel, final int targetPathId) throws RocksDBException { compactRange(null, begin, end, changeLevel, targetLevel, targetPathId); } /** *

 * Range compaction of column family.
 *
 * Note: After the database has been compacted, all data will have been
 * pushed down to the last level containing any data.
 *
 * Compaction outputs should be placed in options.db_paths[target_path_id].
 * Behavior is undefined if target_path_id is out of range.
 *
 * See also:
 *   - {@link #compactRange(ColumnFamilyHandle)}
 *   - {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
 *   - {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
* * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance. * @param begin start of key range (included in range) * @param end end of key range (excluded from range) * @param changeLevel reduce level after compaction * @param targetLevel target level to compact to * @param targetPathId the target path id of output path * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ @Deprecated public void compactRange( /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, final byte[] begin, final byte[] end, final boolean changeLevel, final int targetLevel, final int targetPathId) throws RocksDBException { final CompactRangeOptions options = new CompactRangeOptions(); options.setChangeLevel(changeLevel); options.setTargetLevel(targetLevel); options.setTargetPathId(targetPathId); compactRange(nativeHandle_, begin, begin == null ? -1 : begin.length, end, end == null ? -1 : end.length, options.nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); } /** *
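  // Illustrative sketch (documentation aid): the non-deprecated replacement suggested above,
  // expressing the change-level/target-level behaviour through CompactRangeOptions. Assumes an
  // open `db` and a column family handle `cfHandle`; the option values are examples.
  private static void demoCompactRangeWithOptions(final RocksDB db,
      final ColumnFamilyHandle cfHandle) throws RocksDBException {
    try (final CompactRangeOptions options = new CompactRangeOptions()) {
      options.setChangeLevel(true); // move the compaction output to a lower level if possible
      options.setTargetLevel(1);    // example target level
      // Passing null for begin/end compacts the whole column family.
      db.compactRange(cfHandle, null, null, options);
    }
  }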

 * Range compaction of column family.
 *
 * Note: After the database has been compacted, all data will have been
 * pushed down to the last level containing any data.
* * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance. * @param begin start of key range (included in range) * @param end end of key range (excluded from range) * @param compactRangeOptions options for the compaction * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void compactRange( /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, final byte[] begin, final byte[] end, final CompactRangeOptions compactRangeOptions) throws RocksDBException { compactRange(nativeHandle_, begin, begin == null ? -1 : begin.length, end, end == null ? -1 : end.length, compactRangeOptions.nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); } /** * Change the options for the column family handle. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance, or null for the default column family. * @param mutableColumnFamilyOptions the options. * * @throws RocksDBException if an error occurs whilst setting the options */ public void setOptions( /* @Nullable */final ColumnFamilyHandle columnFamilyHandle, final MutableColumnFamilyOptions mutableColumnFamilyOptions) throws RocksDBException { setOptions(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, mutableColumnFamilyOptions.getKeys(), mutableColumnFamilyOptions.getValues()); } /** * Change the options for the default column family handle. * * @param mutableColumnFamilyOptions the options. * * @throws RocksDBException if an error occurs whilst setting the options */ public void setOptions( final MutableColumnFamilyOptions mutableColumnFamilyOptions) throws RocksDBException { setOptions(null, mutableColumnFamilyOptions); } /** * Set the options for the column family handle. * * @param mutableDBoptions the options. * * @throws RocksDBException if an error occurs whilst setting the options */ public void setDBOptions(final MutableDBOptions mutableDBoptions) throws RocksDBException { setDBOptions(nativeHandle_, mutableDBoptions.getKeys(), mutableDBoptions.getValues()); } /** * Takes a list of files specified by file names and * compacts them to the specified level. * * Note that the behavior is different from * {@link #compactRange(ColumnFamilyHandle, byte[], byte[])} * in that CompactFiles() performs the compaction job using the CURRENT * thread. * * @param compactionOptions compaction options * @param inputFileNames the name of the files to compact * @param outputLevel the level to which they should be compacted * @param outputPathId the id of the output path, or -1 * @param compactionJobInfo the compaction job info, this parameter * will be updated with the info from compacting the files, * can just be null if you don't need it. * * @return the list of compacted files * * @throws RocksDBException if an error occurs during compaction */ public List compactFiles( final CompactionOptions compactionOptions, final List inputFileNames, final int outputLevel, final int outputPathId, /* @Nullable */ final CompactionJobInfo compactionJobInfo) throws RocksDBException { return compactFiles(compactionOptions, null, inputFileNames, outputLevel, outputPathId, compactionJobInfo); } /** * Takes a list of files specified by file names and * compacts them to the specified level. * * Note that the behavior is different from * {@link #compactRange(ColumnFamilyHandle, byte[], byte[])} * in that CompactFiles() performs the compaction job using the CURRENT * thread. 
* * @param compactionOptions compaction options * @param columnFamilyHandle columnFamilyHandle, or null for the * default column family * @param inputFileNames the name of the files to compact * @param outputLevel the level to which they should be compacted * @param outputPathId the id of the output path, or -1 * @param compactionJobInfo the compaction job info, this parameter * will be updated with the info from compacting the files, * can just be null if you don't need it. * * @return the list of compacted files * * @throws RocksDBException if an error occurs during compaction */ public List compactFiles( final CompactionOptions compactionOptions, /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, final List inputFileNames, final int outputLevel, final int outputPathId, /* @Nullable */ final CompactionJobInfo compactionJobInfo) throws RocksDBException { return Arrays.asList(compactFiles(nativeHandle_, compactionOptions.nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, inputFileNames.toArray(new String[0]), outputLevel, outputPathId, compactionJobInfo == null ? 0 : compactionJobInfo.nativeHandle_)); } /** * This function will cancel all currently running background processes. * * @param wait if true, wait for all background work to be cancelled before * returning. * */ public void cancelAllBackgroundWork(boolean wait) { cancelAllBackgroundWork(nativeHandle_, wait); } /** * This function will wait until all currently running background processes * finish. After it returns, no background process will be run until * {@link #continueBackgroundWork()} is called * * @throws RocksDBException if an error occurs when pausing background work */ public void pauseBackgroundWork() throws RocksDBException { pauseBackgroundWork(nativeHandle_); } /** * Resumes background work which was suspended by * previously calling {@link #pauseBackgroundWork()} * * @throws RocksDBException if an error occurs when resuming background work */ public void continueBackgroundWork() throws RocksDBException { continueBackgroundWork(nativeHandle_); } /** * Enable automatic compactions for the given column * families if they were previously disabled. * * The function will first set the * {@link ColumnFamilyOptions#disableAutoCompactions()} option for each * column family to false, after which it will schedule a flush/compaction. * * NOTE: Setting disableAutoCompactions to 'false' through * {@link #setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)} * does NOT schedule a flush/compaction afterwards, and only changes the * parameter itself within the column family option. * * @param columnFamilyHandles the column family handles * * @throws RocksDBException if an error occurs whilst enabling auto-compaction */ public void enableAutoCompaction( final List columnFamilyHandles) throws RocksDBException { enableAutoCompaction(nativeHandle_, toNativeHandleList(columnFamilyHandles)); } /** * Number of levels used for this DB. * * @return the number of levels */ public int numberLevels() { return numberLevels(null); } /** * Number of levels used for a column family in this DB. * * @param columnFamilyHandle the column family handle, or null * for the default column family * * @return the number of levels */ public int numberLevels(/* @Nullable */final ColumnFamilyHandle columnFamilyHandle) { return numberLevels(nativeHandle_, columnFamilyHandle == null ? 
0 : columnFamilyHandle.nativeHandle_); } /** * Maximum level to which a new compacted memtable is pushed if it * does not create overlap. * * @return the maximum level */ public int maxMemCompactionLevel() { return maxMemCompactionLevel(null); } /** * Maximum level to which a new compacted memtable is pushed if it * does not create overlap. * * @param columnFamilyHandle the column family handle * * @return the maximum level */ public int maxMemCompactionLevel( /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle) { return maxMemCompactionLevel(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); } /** * Number of files in level-0 that would stop writes. * * @return the number of files */ public int level0StopWriteTrigger() { return level0StopWriteTrigger(null); } /** * Number of files in level-0 that would stop writes. * * @param columnFamilyHandle the column family handle * * @return the number of files */ public int level0StopWriteTrigger( /* @Nullable */final ColumnFamilyHandle columnFamilyHandle) { return level0StopWriteTrigger(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); } /** * Get DB name -- the exact same name that was provided as an argument to * as path to {@link #open(Options, String)}. * * @return the DB name */ public String getName() { return getName(nativeHandle_); } /** * Get the Env object from the DB * * @return the env */ public Env getEnv() { final long envHandle = getEnv(nativeHandle_); if (envHandle == Env.getDefault().nativeHandle_) { return Env.getDefault(); } else { final Env env = new RocksEnv(envHandle); env.disOwnNativeHandle(); // we do not own the Env! return env; } } /** *
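  // Illustrative sketch (documentation aid): changing mutable options at runtime with
  // setOptions() and then re-enabling auto-compaction as described above. Assumes an open `db`
  // and a column family handle `cfHandle`; the chosen option values are examples.
  private static void demoMutableOptions(final RocksDB db, final ColumnFamilyHandle cfHandle)
      throws RocksDBException {
    // Temporarily disable automatic compactions for this column family (no flush is scheduled).
    db.setOptions(cfHandle,
        MutableColumnFamilyOptions.builder().setDisableAutoCompactions(true).build());
    // ... bulk-load data here ...
    // Re-enable them; unlike setOptions(), this also schedules a flush/compaction.
    db.enableAutoCompaction(Arrays.asList(cfHandle));
  }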

 * Flush all memory table data.
 *
 * Note: it must be ensured that the FlushOptions instance is not GC'ed
 * before this method finishes. If the wait parameter is set to false,
 * flush processing is asynchronous.
* * @param flushOptions {@link org.rocksdb.FlushOptions} instance. * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void flush(final FlushOptions flushOptions) throws RocksDBException { flush(flushOptions, (List) null); } /** *

 * Flush all memory table data.
 *
 * Note: it must be ensured that the FlushOptions instance is not GC'ed
 * before this method finishes. If the wait parameter is set to false,
 * flush processing is asynchronous.
* * @param flushOptions {@link org.rocksdb.FlushOptions} instance. * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance. * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void flush(final FlushOptions flushOptions, /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle) throws RocksDBException { flush(flushOptions, columnFamilyHandle == null ? null : Arrays.asList(columnFamilyHandle)); } /** * Flushes multiple column families. * * If atomic flush is not enabled, this is equivalent to calling * {@link #flush(FlushOptions, ColumnFamilyHandle)} multiple times. * * If atomic flush is enabled, this will flush all column families * specified up to the latest sequence number at the time when flush is * requested. * * @param flushOptions {@link org.rocksdb.FlushOptions} instance. * @param columnFamilyHandles column family handles. * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ public void flush(final FlushOptions flushOptions, /* @Nullable */ final List columnFamilyHandles) throws RocksDBException { flush(nativeHandle_, flushOptions.nativeHandle_, toNativeHandleList(columnFamilyHandles)); } /** * Flush the WAL memory buffer to the file. If {@code sync} is true, * it calls {@link #syncWal()} afterwards. * * @param sync true to also fsync to disk. * * @throws RocksDBException if an error occurs whilst flushing */ public void flushWal(final boolean sync) throws RocksDBException { flushWal(nativeHandle_, sync); } /** * Sync the WAL. * * Note that {@link #write(WriteOptions, WriteBatch)} followed by * {@link #syncWal()} is not exactly the same as * {@link #write(WriteOptions, WriteBatch)} with * {@link WriteOptions#sync()} set to true; In the latter case the changes * won't be visible until the sync is done. * * Currently only works if {@link Options#allowMmapWrites()} is set to false. * * @throws RocksDBException if an error occurs whilst syncing */ public void syncWal() throws RocksDBException { syncWal(nativeHandle_); } /** *
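  // Illustrative sketch (documentation aid): forcing memtable and WAL data to stable storage
  // with the flush methods above. Assumes an already-open `db`.
  private static void demoFlush(final RocksDB db) throws RocksDBException {
    try (final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) {
      // Flush the default column family's memtable and wait for completion; the FlushOptions
      // instance must stay reachable until the call returns, as noted above.
      db.flush(flushOptions);
    }
    // Flush the WAL buffer to the log file and fsync it.
    db.flushWal(true);
  }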

 * The sequence number of the most recent transaction.
* * @return sequence number of the most * recent transaction. */ public long getLatestSequenceNumber() { return getLatestSequenceNumber(nativeHandle_); } /** * Instructs DB to preserve deletes with sequence numbers >= sequenceNumber. * * Has no effect if DBOptions#preserveDeletes() is set to false. * * This function assumes that user calls this function with monotonically * increasing seqnums (otherwise we can't guarantee that a particular delete * hasn't been already processed). * * @param sequenceNumber the minimum sequence number to preserve * * @return true if the value was successfully updated, * false if user attempted to call if with * sequenceNumber <= current value. */ public boolean setPreserveDeletesSequenceNumber(final long sequenceNumber) { return setPreserveDeletesSequenceNumber(nativeHandle_, sequenceNumber); } /** *
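  // Illustrative sketch (documentation aid): recording the latest sequence number and later
  // tailing the WAL from that point with getUpdatesSince() (documented further below). Assumes
  // an open `db` whose WAL retention (WAL_ttl_seconds / WAL_size_limit_MB) is large enough.
  private static void demoTailWal(final RocksDB db) throws RocksDBException {
    final long checkpointSeq = db.getLatestSequenceNumber();
    // ... more writes happen here ...
    try (final TransactionLogIterator it = db.getUpdatesSince(checkpointSeq)) {
      while (it.isValid()) {
        final TransactionLogIterator.BatchResult batch = it.getBatch();
        System.out.println("write batch at sequence " + batch.sequenceNumber());
        it.next();
      }
    }
  }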

 * Prevent file deletions. Compactions will continue to occur, but no
 * obsolete files will be deleted. Calling this multiple times has the same
 * effect as calling it once.
* * @throws RocksDBException thrown if operation was not performed * successfully. */ public void disableFileDeletions() throws RocksDBException { disableFileDeletions(nativeHandle_); } /** *

 * Allow compactions to delete obsolete files.
 * If force == true, the call to EnableFileDeletions() will guarantee that
 * file deletions are enabled after the call, even if DisableFileDeletions()
 * was called multiple times before.
 *
 * If force == false, EnableFileDeletions will only enable file deletion
 * after it's been called at least as many times as DisableFileDeletions(),
 * enabling the two methods to be called by two threads concurrently without
 * synchronization -- i.e., file deletions will be enabled only after both
 * threads call EnableFileDeletions().
* * @param force boolean value described above. * * @throws RocksDBException thrown if operation was not performed * successfully. */ public void enableFileDeletions(final boolean force) throws RocksDBException { enableFileDeletions(nativeHandle_, force); } public static class LiveFiles { /** * The valid size of the manifest file. The manifest file is an ever growing * file, but only the portion specified here is valid for this snapshot. */ public final long manifestFileSize; /** * The files are relative to the {@link #getName()} and are not * absolute paths. Despite being relative paths, the file names begin * with "/". */ public final List files; LiveFiles(final long manifestFileSize, final List files) { this.manifestFileSize = manifestFileSize; this.files = files; } } /** * Retrieve the list of all files in the database after flushing the memtable. * * See {@link #getLiveFiles(boolean)}. * * @return the live files * * @throws RocksDBException if an error occurs whilst retrieving the list * of live files */ public LiveFiles getLiveFiles() throws RocksDBException { return getLiveFiles(true); } /** * Retrieve the list of all files in the database. * * In case you have multiple column families, even if {@code flushMemtable} * is true, you still need to call {@link #getSortedWalFiles()} * after {@link #getLiveFiles(boolean)} to compensate for new data that * arrived to already-flushed column families while other column families * were flushing. * * NOTE: Calling {@link #getLiveFiles(boolean)} followed by * {@link #getSortedWalFiles()} can generate a lossless backup. * * @param flushMemtable set to true to flush before recoding the live * files. Setting to false is useful when we don't want to wait for flush * which may have to wait for compaction to complete taking an * indeterminate time. * * @return the live files * * @throws RocksDBException if an error occurs whilst retrieving the list * of live files */ public LiveFiles getLiveFiles(final boolean flushMemtable) throws RocksDBException { final String[] result = getLiveFiles(nativeHandle_, flushMemtable); if (result == null) { return null; } final String[] files = Arrays.copyOf(result, result.length - 1); final long manifestFileSize = Long.parseLong(result[result.length - 1]); return new LiveFiles(manifestFileSize, Arrays.asList(files)); } /** * Retrieve the sorted list of all wal files with earliest file first. * * @return the log files * * @throws RocksDBException if an error occurs whilst retrieving the list * of sorted WAL files */ public List getSortedWalFiles() throws RocksDBException { final LogFile[] logFiles = getSortedWalFiles(nativeHandle_); return Arrays.asList(logFiles); } /** *
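  // Illustrative sketch (documentation aid) of the lossless-backup recipe described above:
  // pause file deletions, list the live data files and the sorted WAL files, then resume
  // deletions. The actual copy step is elided. Assumes an already-open `db`.
  private static void demoListBackupFiles(final RocksDB db) throws RocksDBException {
    db.disableFileDeletions();
    try {
      final LiveFiles liveFiles = db.getLiveFiles(false); // skip the flush; WAL files cover the rest
      System.out.println("valid manifest bytes: " + liveFiles.manifestFileSize);
      for (final String file : liveFiles.files) {
        System.out.println("live file: " + file + " (relative to " + db.getName() + ")");
      }
      for (final LogFile walFile : db.getSortedWalFiles()) {
        System.out.println("wal file: " + walFile.pathName());
      }
    } finally {
      db.enableFileDeletions(false);
    }
  }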

 * Returns an iterator that is positioned at a write-batch containing
 * seq_number. If the sequence number is non-existent, it returns an iterator
 * at the first available seq_no after the requested seq_no.
 *
 * Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to use this
 * API, else the WAL files will get cleared aggressively and the iterator
 * might keep getting invalidated before an update is read.
* * @param sequenceNumber sequence number offset * * @return {@link org.rocksdb.TransactionLogIterator} instance. * * @throws org.rocksdb.RocksDBException if iterator cannot be retrieved * from native-side. */ public TransactionLogIterator getUpdatesSince(final long sequenceNumber) throws RocksDBException { return new TransactionLogIterator( getUpdatesSince(nativeHandle_, sequenceNumber)); } /** * Delete the file name from the db directory and update the internal state to * reflect that. Supports deletion of sst and log files only. 'name' must be * path relative to the db directory. eg. 000001.sst, /archive/000003.log * * @param name the file name * * @throws RocksDBException if an error occurs whilst deleting the file */ public void deleteFile(final String name) throws RocksDBException { deleteFile(nativeHandle_, name); } /** * Gets a list of all table files metadata. * * @return table files metadata. */ public List getLiveFilesMetaData() { return Arrays.asList(getLiveFilesMetaData(nativeHandle_)); } /** * Obtains the meta data of the specified column family of the DB. * * @param columnFamilyHandle the column family * * @return the column family metadata */ public ColumnFamilyMetaData getColumnFamilyMetaData( /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle) { return getColumnFamilyMetaData(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); } /** * Obtains the meta data of the default column family of the DB. * * @return the column family metadata */ public ColumnFamilyMetaData getColumnFamilyMetaData() { return getColumnFamilyMetaData(null); } /** * ingestExternalFile will load a list of external SST files (1) into the DB * We will try to find the lowest possible level that the file can fit in, and * ingest the file into this level (2). A file that have a key range that * overlap with the memtable key range will require us to Flush the memtable * first before ingesting the file. * * (1) External SST files can be created using {@link SstFileWriter} * (2) We will try to ingest the files to the lowest possible level * even if the file compression doesn't match the level compression * * @param filePathList The list of files to ingest * @param ingestExternalFileOptions the options for the ingestion * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void ingestExternalFile(final List filePathList, final IngestExternalFileOptions ingestExternalFileOptions) throws RocksDBException { ingestExternalFile(nativeHandle_, getDefaultColumnFamily().nativeHandle_, filePathList.toArray(new String[0]), filePathList.size(), ingestExternalFileOptions.nativeHandle_); } /** * ingestExternalFile will load a list of external SST files (1) into the DB * We will try to find the lowest possible level that the file can fit in, and * ingest the file into this level (2). A file that have a key range that * overlap with the memtable key range will require us to Flush the memtable * first before ingesting the file. * * (1) External SST files can be created using {@link SstFileWriter} * (2) We will try to ingest the files to the lowest possible level * even if the file compression doesn't match the level compression * * @param columnFamilyHandle The column family for the ingested files * @param filePathList The list of files to ingest * @param ingestExternalFileOptions the options for the ingestion * * @throws RocksDBException thrown if error happens in underlying * native library. 
*/ public void ingestExternalFile(final ColumnFamilyHandle columnFamilyHandle, final List filePathList, final IngestExternalFileOptions ingestExternalFileOptions) throws RocksDBException { ingestExternalFile(nativeHandle_, columnFamilyHandle.nativeHandle_, filePathList.toArray(new String[0]), filePathList.size(), ingestExternalFileOptions.nativeHandle_); } /** * Verify checksum * * @throws RocksDBException if the checksum is not valid */ public void verifyChecksum() throws RocksDBException { verifyChecksum(nativeHandle_); } /** * Gets the handle for the default column family * * @return The handle of the default column family */ public ColumnFamilyHandle getDefaultColumnFamily() { final ColumnFamilyHandle cfHandle = new ColumnFamilyHandle(this, getDefaultColumnFamily(nativeHandle_)); cfHandle.disOwnNativeHandle(); return cfHandle; } /** * Get the properties of all tables. * * @param columnFamilyHandle the column family handle, or null for the default * column family. * * @return the properties * * @throws RocksDBException if an error occurs whilst getting the properties */ public Map getPropertiesOfAllTables( /* @Nullable */final ColumnFamilyHandle columnFamilyHandle) throws RocksDBException { return getPropertiesOfAllTables(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); } /** * Get the properties of all tables in the default column family. * * @return the properties * * @throws RocksDBException if an error occurs whilst getting the properties */ public Map getPropertiesOfAllTables() throws RocksDBException { return getPropertiesOfAllTables(null); } /** * Get the properties of tables in range. * * @param columnFamilyHandle the column family handle, or null for the default * column family. * @param ranges the ranges over which to get the table properties * * @return the properties * * @throws RocksDBException if an error occurs whilst getting the properties */ public Map getPropertiesOfTablesInRange( /* @Nullable */final ColumnFamilyHandle columnFamilyHandle, final List ranges) throws RocksDBException { return getPropertiesOfTablesInRange(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, toRangeSliceHandles(ranges)); } /** * Get the properties of tables in range for the default column family. * * @param ranges the ranges over which to get the table properties * * @return the properties * * @throws RocksDBException if an error occurs whilst getting the properties */ public Map getPropertiesOfTablesInRange( final List ranges) throws RocksDBException { return getPropertiesOfTablesInRange(null, ranges); } /** * Suggest the range to compact. * * @param columnFamilyHandle the column family handle, or null for the default * column family. * * @return the suggested range. * * @throws RocksDBException if an error occurs whilst suggesting the range */ public Range suggestCompactRange( /* @Nullable */final ColumnFamilyHandle columnFamilyHandle) throws RocksDBException { final long[] rangeSliceHandles = suggestCompactRange(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); return new Range(new Slice(rangeSliceHandles[0]), new Slice(rangeSliceHandles[1])); } /** * Suggest the range to compact for the default column family. * * @return the suggested range. * * @throws RocksDBException if an error occurs whilst suggesting the range */ public Range suggestCompactRange() throws RocksDBException { return suggestCompactRange(null); } /** * Promote L0. 
* * @param columnFamilyHandle the column family handle, * or null for the default column family. * @param targetLevel the target level for L0 * * @throws RocksDBException if an error occurs whilst promoting L0 */ public void promoteL0( /* @Nullable */final ColumnFamilyHandle columnFamilyHandle, final int targetLevel) throws RocksDBException { promoteL0(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, targetLevel); } /** * Promote L0 for the default column family. * * @param targetLevel the target level for L0 * * @throws RocksDBException if an error occurs whilst promoting L0 */ public void promoteL0(final int targetLevel) throws RocksDBException { promoteL0(null, targetLevel); } /** * Trace DB operations. * * Use {@link #endTrace()} to stop tracing. * * @param traceOptions the options * @param traceWriter the trace writer * * @throws RocksDBException if an error occurs whilst starting the trace */ public void startTrace(final TraceOptions traceOptions, final AbstractTraceWriter traceWriter) throws RocksDBException { startTrace(nativeHandle_, traceOptions.getMaxTraceFileSize(), traceWriter.nativeHandle_); /** * NOTE: {@link #startTrace(long, long, long) transfers the ownership * from Java to C++, so we must disown the native handle here. */ traceWriter.disOwnNativeHandle(); } /** * Stop tracing DB operations. * * See {@link #startTrace(TraceOptions, AbstractTraceWriter)} * * @throws RocksDBException if an error occurs whilst ending the trace */ public void endTrace() throws RocksDBException { endTrace(nativeHandle_); } /** * Make the secondary instance catch up with the primary by tailing and * replaying the MANIFEST and WAL of the primary. * Column families created by the primary after the secondary instance starts * will be ignored unless the secondary instance closes and restarts with the * newly created column families. * Column families that exist before secondary instance starts and dropped by * the primary afterwards will be marked as dropped. However, as long as the * secondary instance does not delete the corresponding column family * handles, the data of the column family is still accessible to the * secondary. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void tryCatchUpWithPrimary() throws RocksDBException { tryCatchUpWithPrimary(nativeHandle_); } /** * Delete files in multiple ranges at once. * Delete files in a lot of ranges one at a time can be slow, use this API for * better performance in that case. * * @param columnFamily - The column family for operation (null for default) * @param includeEnd - Whether ranges should include end * @param ranges - pairs of ranges (from1, to1, from2, to2, ...) * * @throws RocksDBException thrown if error happens in underlying * native library. */ public void deleteFilesInRanges(final ColumnFamilyHandle columnFamily, final List ranges, final boolean includeEnd) throws RocksDBException { if (ranges.size() == 0) { return; } if ((ranges.size() % 2) != 0) { throw new IllegalArgumentException("Ranges size needs to be multiple of 2 " + "(from1, to1, from2, to2, ...), but is " + ranges.size()); } final byte[][] rangesArray = ranges.toArray(new byte[ranges.size()][]); deleteFilesInRanges(nativeHandle_, columnFamily == null ? 0 : columnFamily.nativeHandle_, rangesArray, includeEnd); } /** * Static method to destroy the contents of the specified database. * Be very careful using this method. * * @param path the path to the Rocksdb database. 
* @param options {@link org.rocksdb.Options} instance. * * @throws RocksDBException thrown if error happens in underlying * native library. */ public static void destroyDB(final String path, final Options options) throws RocksDBException { destroyDB(path, options.nativeHandle_); } private /* @Nullable */ long[] toNativeHandleList( /* @Nullable */ final List objectList) { if (objectList == null) { return null; } final int len = objectList.size(); final long[] handleList = new long[len]; for (int i = 0; i < len; i++) { handleList[i] = objectList.get(i).nativeHandle_; } return handleList; } private static long[] toRangeSliceHandles(final List ranges) { final long rangeSliceHandles[] = new long [ranges.size() * 2]; for (int i = 0, j = 0; i < ranges.size(); i++) { final Range range = ranges.get(i); rangeSliceHandles[j++] = range.start.getNativeHandle(); rangeSliceHandles[j++] = range.limit.getNativeHandle(); } return rangeSliceHandles; } protected void storeOptionsInstance(DBOptionsInterface options) { options_ = options; } private static void checkBounds(int offset, int len, int size) { if ((offset | len | (offset + len) | (size - (offset + len))) < 0) { throw new IndexOutOfBoundsException(String.format("offset(%d), len(%d), size(%d)", offset, len, size)); } } private static int computeCapacityHint(final int estimatedNumberOfItems) { // Default load factor for HashMap is 0.75, so N * 1.5 will be at the load // limit. We add +1 for a buffer. return (int)Math.ceil(estimatedNumberOfItems * 1.5 + 1.0); } // native methods private native static long open(final long optionsHandle, final String path) throws RocksDBException; /** * @param optionsHandle Native handle pointing to an Options object * @param path The directory path for the database files * @param columnFamilyNames An array of column family names * @param columnFamilyOptions An array of native handles pointing to * ColumnFamilyOptions objects * * @return An array of native handles, [0] is the handle of the RocksDB object * [1..1+n] are handles of the ColumnFamilyReferences * * @throws RocksDBException thrown if the database could not be opened */ private native static long[] open(final long optionsHandle, final String path, final byte[][] columnFamilyNames, final long[] columnFamilyOptions) throws RocksDBException; private native static long openROnly(final long optionsHandle, final String path, final boolean errorIfWalFileExists) throws RocksDBException; /** * @param optionsHandle Native handle pointing to an Options object * @param path The directory path for the database files * @param columnFamilyNames An array of column family names * @param columnFamilyOptions An array of native handles pointing to * ColumnFamilyOptions objects * * @return An array of native handles, [0] is the handle of the RocksDB object * [1..1+n] are handles of the ColumnFamilyReferences * * @throws RocksDBException thrown if the database could not be opened */ private native static long[] openROnly(final long optionsHandle, final String path, final byte[][] columnFamilyNames, final long[] columnFamilyOptions, final boolean errorIfWalFileExists) throws RocksDBException; private native static long openAsSecondary(final long optionsHandle, final String path, final String secondaryPath) throws RocksDBException; private native static long[] openAsSecondary(final long optionsHandle, final String path, final String secondaryPath, final byte[][] columnFamilyNames, final long[] columnFamilyOptions) throws RocksDBException; @Override protected native void 
disposeInternal(final long handle); private native static void closeDatabase(final long handle) throws RocksDBException; private native static byte[][] listColumnFamilies(final long optionsHandle, final String path) throws RocksDBException; private native long createColumnFamily(final long handle, final byte[] columnFamilyName, final int columnFamilyNamelen, final long columnFamilyOptions) throws RocksDBException; private native long[] createColumnFamilies(final long handle, final long columnFamilyOptionsHandle, final byte[][] columnFamilyNames) throws RocksDBException; private native long[] createColumnFamilies(final long handle, final long columnFamilyOptionsHandles[], final byte[][] columnFamilyNames) throws RocksDBException; private native void dropColumnFamily( final long handle, final long cfHandle) throws RocksDBException; private native void dropColumnFamilies(final long handle, final long[] cfHandles) throws RocksDBException; private native void put(final long handle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, int valueLength) throws RocksDBException; private native void put(final long handle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, final int valueLength, final long cfHandle) throws RocksDBException; private native void put(final long handle, final long writeOptHandle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, final int valueLength) throws RocksDBException; private native void put(final long handle, final long writeOptHandle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, final int valueLength, final long cfHandle) throws RocksDBException; private native void delete(final long handle, final byte[] key, final int keyOffset, final int keyLength) throws RocksDBException; private native void delete(final long handle, final byte[] key, final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; private native void delete(final long handle, final long writeOptHandle, final byte[] key, final int keyOffset, final int keyLength) throws RocksDBException; private native void delete(final long handle, final long writeOptHandle, final byte[] key, final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; private native void singleDelete( final long handle, final byte[] key, final int keyLen) throws RocksDBException; private native void singleDelete( final long handle, final byte[] key, final int keyLen, final long cfHandle) throws RocksDBException; private native void singleDelete( final long handle, final long writeOptHandle, final byte[] key, final int keyLen) throws RocksDBException; private native void singleDelete( final long handle, final long writeOptHandle, final byte[] key, final int keyLen, final long cfHandle) throws RocksDBException; private native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyOffset, final int beginKeyLength, final byte[] endKey, final int endKeyOffset, final int endKeyLength) throws RocksDBException; private native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyOffset, final int beginKeyLength, final byte[] endKey, final int endKeyOffset, final int endKeyLength, final long cfHandle) throws RocksDBException; private native void deleteRange(final long handle, final long writeOptHandle, final byte[] beginKey, final 
int beginKeyOffset, final int beginKeyLength, final byte[] endKey, final int endKeyOffset, final int endKeyLength) throws RocksDBException; private native void deleteRange( final long handle, final long writeOptHandle, final byte[] beginKey, final int beginKeyOffset, final int beginKeyLength, final byte[] endKey, final int endKeyOffset, final int endKeyLength, final long cfHandle) throws RocksDBException; private native void merge(final long handle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, final int valueLength) throws RocksDBException; private native void merge(final long handle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, final int valueLength, final long cfHandle) throws RocksDBException; private native void merge(final long handle, final long writeOptHandle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, final int valueLength) throws RocksDBException; private native void merge(final long handle, final long writeOptHandle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, final int valueLength, final long cfHandle) throws RocksDBException; private native void write0(final long handle, final long writeOptHandle, final long wbHandle) throws RocksDBException; private native void write1(final long handle, final long writeOptHandle, final long wbwiHandle) throws RocksDBException; private native int get(final long handle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, final int valueLength) throws RocksDBException; private native int get(final long handle, final byte[] key, final int keyOffset, final int keyLength, byte[] value, final int valueOffset, final int valueLength, final long cfHandle) throws RocksDBException; private native int get(final long handle, final long readOptHandle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, final int valueLength) throws RocksDBException; private native int get(final long handle, final long readOptHandle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, final int valueLength, final long cfHandle) throws RocksDBException; private native byte[] get(final long handle, byte[] key, final int keyOffset, final int keyLength) throws RocksDBException; private native byte[] get(final long handle, final byte[] key, final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; private native byte[] get(final long handle, final long readOptHandle, final byte[] key, final int keyOffset, final int keyLength) throws RocksDBException; private native byte[] get(final long handle, final long readOptHandle, final byte[] key, final int keyOffset, final int keyLength, final long cfHandle) throws RocksDBException; private native byte[][] multiGet(final long dbHandle, final byte[][] keys, final int[] keyOffsets, final int[] keyLengths); private native byte[][] multiGet(final long dbHandle, final byte[][] keys, final int[] keyOffsets, final int[] keyLengths, final long[] columnFamilyHandles); private native byte[][] multiGet(final long dbHandle, final long rOptHandle, final byte[][] keys, final int[] keyOffsets, final int[] keyLengths); private native byte[][] multiGet(final long dbHandle, final long rOptHandle, final byte[][] keys, final int[] keyOffsets, final 
int[] keyLengths, final long[] columnFamilyHandles); private native boolean keyMayExist( final long handle, final long cfHandle, final long readOptHandle, final byte[] key, final int keyOffset, final int keyLength); private native byte[][] keyMayExistFoundValue( final long handle, final long cfHandle, final long readOptHandle, final byte[] key, final int keyOffset, final int keyLength); private native void putDirect(long handle, long writeOptHandle, ByteBuffer key, int keyOffset, int keyLength, ByteBuffer value, int valueOffset, int valueLength, long cfHandle) throws RocksDBException; private native long iterator(final long handle); private native long iterator(final long handle, final long readOptHandle); private native long iteratorCF(final long handle, final long cfHandle); private native long iteratorCF(final long handle, final long cfHandle, final long readOptHandle); private native long[] iterators(final long handle, final long[] columnFamilyHandles, final long readOptHandle) throws RocksDBException; private native long getSnapshot(final long nativeHandle); private native void releaseSnapshot( final long nativeHandle, final long snapshotHandle); private native String getProperty(final long nativeHandle, final long cfHandle, final String property, final int propertyLength) throws RocksDBException; private native Map getMapProperty(final long nativeHandle, final long cfHandle, final String property, final int propertyLength) throws RocksDBException; private native int getDirect(long handle, long readOptHandle, ByteBuffer key, int keyOffset, int keyLength, ByteBuffer value, int valueOffset, int valueLength, long cfHandle) throws RocksDBException; private native void deleteDirect(long handle, long optHandle, ByteBuffer key, int keyOffset, int keyLength, long cfHandle) throws RocksDBException; private native long getLongProperty(final long nativeHandle, final long cfHandle, final String property, final int propertyLength) throws RocksDBException; private native void resetStats(final long nativeHandle) throws RocksDBException; private native long getAggregatedLongProperty(final long nativeHandle, final String property, int propertyLength) throws RocksDBException; private native long[] getApproximateSizes(final long nativeHandle, final long columnFamilyHandle, final long[] rangeSliceHandles, final byte includeFlags); private final native long[] getApproximateMemTableStats( final long nativeHandle, final long columnFamilyHandle, final long rangeStartSliceHandle, final long rangeLimitSliceHandle); private native void compactRange(final long handle, /* @Nullable */ final byte[] begin, final int beginLen, /* @Nullable */ final byte[] end, final int endLen, final long compactRangeOptHandle, final long cfHandle) throws RocksDBException; private native void setOptions(final long handle, final long cfHandle, final String[] keys, final String[] values) throws RocksDBException; private native void setDBOptions(final long handle, final String[] keys, final String[] values) throws RocksDBException; private native String[] compactFiles(final long handle, final long compactionOptionsHandle, final long columnFamilyHandle, final String[] inputFileNames, final int outputLevel, final int outputPathId, final long compactionJobInfoHandle) throws RocksDBException; private native void cancelAllBackgroundWork(final long handle, final boolean wait); private native void pauseBackgroundWork(final long handle) throws RocksDBException; private native void continueBackgroundWork(final long handle) throws 
RocksDBException; private native void enableAutoCompaction(final long handle, final long[] columnFamilyHandles) throws RocksDBException; private native int numberLevels(final long handle, final long columnFamilyHandle); private native int maxMemCompactionLevel(final long handle, final long columnFamilyHandle); private native int level0StopWriteTrigger(final long handle, final long columnFamilyHandle); private native String getName(final long handle); private native long getEnv(final long handle); private native void flush(final long handle, final long flushOptHandle, /* @Nullable */ final long[] cfHandles) throws RocksDBException; private native void flushWal(final long handle, final boolean sync) throws RocksDBException; private native void syncWal(final long handle) throws RocksDBException; private native long getLatestSequenceNumber(final long handle); private native boolean setPreserveDeletesSequenceNumber(final long handle, final long sequenceNumber); private native void disableFileDeletions(long handle) throws RocksDBException; private native void enableFileDeletions(long handle, boolean force) throws RocksDBException; private native String[] getLiveFiles(final long handle, final boolean flushMemtable) throws RocksDBException; private native LogFile[] getSortedWalFiles(final long handle) throws RocksDBException; private native long getUpdatesSince(final long handle, final long sequenceNumber) throws RocksDBException; private native void deleteFile(final long handle, final String name) throws RocksDBException; private native LiveFileMetaData[] getLiveFilesMetaData(final long handle); private native ColumnFamilyMetaData getColumnFamilyMetaData( final long handle, final long columnFamilyHandle); private native void ingestExternalFile(final long handle, final long columnFamilyHandle, final String[] filePathList, final int filePathListLen, final long ingestExternalFileOptionsHandle) throws RocksDBException; private native void verifyChecksum(final long handle) throws RocksDBException; private native long getDefaultColumnFamily(final long handle); private native Map getPropertiesOfAllTables( final long handle, final long columnFamilyHandle) throws RocksDBException; private native Map getPropertiesOfTablesInRange( final long handle, final long columnFamilyHandle, final long[] rangeSliceHandles); private native long[] suggestCompactRange(final long handle, final long columnFamilyHandle) throws RocksDBException; private native void promoteL0(final long handle, final long columnFamilyHandle, final int tragetLevel) throws RocksDBException; private native void startTrace(final long handle, final long maxTraceFileSize, final long traceWriterHandle) throws RocksDBException; private native void endTrace(final long handle) throws RocksDBException; private native void tryCatchUpWithPrimary(final long handle) throws RocksDBException; private native void deleteFilesInRanges(long handle, long cfHandle, final byte[][] ranges, boolean include_end) throws RocksDBException; private native static void destroyDB(final String path, final long optionsHandle) throws RocksDBException; private native static int version(); protected DBOptionsInterface options_; private static Version version; public static class Version { private final byte major; private final byte minor; private final byte patch; public Version(final byte major, final byte minor, final byte patch) { this.major = major; this.minor = minor; this.patch = patch; } public int getMajor() { return major; } public int getMinor() { return minor; } public 
int getPatch() { return patch; } @Override public String toString() { return getMajor() + "." + getMinor() + "." + getPatch(); } private static Version fromEncodedVersion(int encodedVersion) { final byte patch = (byte) (encodedVersion & 0xff); encodedVersion >>= 8; final byte minor = (byte) (encodedVersion & 0xff); encodedVersion >>= 8; final byte major = (byte) (encodedVersion & 0xff); return new Version(major, minor, patch); } } } ================================================ FILE: fire-enhance/apache-spark/pom.xml ================================================ 4.0.0 fire-enhance-spark_${spark.reference} 2.3.2-SNAPSHOT jar Fire : Enhance : Spark com.zto.fire fire-enhance 2.3.2-SNAPSHOT ../pom.xml org.apache.spark spark-core_${scala.binary.version} com.esotericsoftware.kryo kryo ${spark.version} ${maven.scope} org.apache.spark spark-sql_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.spark spark-streaming_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.spark spark-sql-kafka-0-10_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.spark spark-streaming_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.spark spark-streaming-kafka-0-10_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.hadoop hadoop-common ${hadoop.version} ${maven.scope} org.apache.hadoop hadoop-hdfs ${hadoop.version} ${maven.scope} org.apache.hadoop hadoop-client ${hadoop.version} ${maven.scope} org.apache.hbase hbase-common ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-server ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-client ${hbase.version} org.apache.rocketmq rocketmq-client ${rocketmq.version} ${maven.scope} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-enhance/apache-spark/src/main/scala-spark-3.0/org/apache/spark/internal/config/Streaming.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.spark.internal.config import java.util.concurrent.TimeUnit private[spark] object Streaming { private[spark] val STREAMING_DYN_ALLOCATION_ENABLED = ConfigBuilder("spark.streaming.dynamicAllocation.enabled") .version("3.0.0") .booleanConf .createWithDefault(false) private[spark] val STREAMING_DYN_ALLOCATION_TESTING = ConfigBuilder("spark.streaming.dynamicAllocation.testing") .version("3.0.0") .booleanConf .createWithDefault(false) private[spark] val STREAMING_DYN_ALLOCATION_MIN_EXECUTORS = ConfigBuilder("spark.streaming.dynamicAllocation.minExecutors") .version("3.0.0") .intConf .checkValue(_ > 0, "The min executor number of streaming dynamic " + "allocation must be positive.") .createOptional private[spark] val STREAMING_DYN_ALLOCATION_MAX_EXECUTORS = ConfigBuilder("spark.streaming.dynamicAllocation.maxExecutors") .version("3.0.0") .intConf .checkValue(_ > 0, "The max executor number of streaming dynamic " + "allocation must be positive.") .createWithDefault(Int.MaxValue) private[spark] val STREAMING_DYN_ALLOCATION_SCALING_INTERVAL = ConfigBuilder("spark.streaming.dynamicAllocation.scalingInterval") .version("3.0.0") .timeConf(TimeUnit.SECONDS) .checkValue(_ > 0, "The scaling interval of streaming dynamic " + "allocation must be positive.") .createWithDefault(60) private[spark] val STREAMING_DYN_ALLOCATION_SCALING_UP_RATIO = ConfigBuilder("spark.streaming.dynamicAllocation.scalingUpRatio") .version("3.0.0") .doubleConf .checkValue(_ > 0, "The scaling up ratio of streaming dynamic " + "allocation must be positive.") .createWithDefault(0.9) private[spark] val STREAMING_DYN_ALLOCATION_SCALING_DOWN_RATIO = ConfigBuilder("spark.streaming.dynamicAllocation.scalingDownRatio") .version("3.0.0") .doubleConf .checkValue(_ > 0, "The scaling down ratio of streaming dynamic " + "allocation must be positive.") .createWithDefault(0.3) // TODO: ------------ start:二次开发代码 --------------- // private[spark] val STREAMING_DYN_ALLOCATION_EXECUTOR_UP_STEP = ConfigBuilder("spark.streaming.dynamicAllocation.upStepSize") .version("3.0.0") .intConf .checkValue(_ > 0, "The number of executors dynamically applied for each time.") .createWithDefault(1) private[spark] val STREAMING_DYN_ALLOCATION_EXECUTOR_DOWN_STEP = ConfigBuilder("spark.streaming.dynamicAllocation.downStepSize") .version("3.0.0") .intConf .checkValue(_ > 0, "The number of executors dynamically released each time.") .createWithDefault(1) // TODO: ------------ end:二次开发代码 --------------- // } ================================================ FILE: fire-enhance/apache-spark/src/main/scala-spark-3.0/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils._ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.PartitionOverwriteMode import org.apache.spark.sql.util.SchemaUtils import java.io.IOException import scala.util.Try /** * A command for writing data to a [[HadoopFsRelation]]. Supports both overwriting and appending. * Writing to dynamic partitions is also supported. * * @param staticPartitions partial partitioning spec for write. This defines the scope of partition * overwrites: when the spec is empty, all partitions are overwritten. * When it covers a prefix of the partition keys, only partitions matching * the prefix are overwritten. * @param ifPartitionNotExists If true, only write if the partition does not exist. * Only valid for static partitions. */ case class InsertIntoHadoopFsRelationCommand( outputPath: Path, staticPartitions: TablePartitionSpec, ifPartitionNotExists: Boolean, partitionColumns: Seq[Attribute], bucketSpec: Option[BucketSpec], fileFormat: FileFormat, options: Map[String, String], query: LogicalPlan, mode: SaveMode, catalogTable: Option[CatalogTable], fileIndex: Option[FileIndex], outputColumnNames: Seq[String]) extends DataWritingCommand { private lazy val parameters = CaseInsensitiveMap(options) private[sql] lazy val dynamicPartitionOverwrite: Boolean = { val partitionOverwriteMode = parameters.get("partitionOverwriteMode") // scalastyle:off caselocale .map(mode => PartitionOverwriteMode.withName(mode.toUpperCase)) // scalastyle:on caselocale .getOrElse(SQLConf.get.partitionOverwriteMode) val enableDynamicOverwrite = partitionOverwriteMode == PartitionOverwriteMode.DYNAMIC // This config only makes sense when we are overwriting a partitioned dataset with dynamic // partition columns. 
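// Dynamic overwrite can be requested either globally, e.g.
// spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic"),
// or per write through the "partitionOverwriteMode" option read above, e.g. (illustrative table name):
// df.write.mode("overwrite").option("partitionOverwriteMode", "dynamic").insertInto("t_partitioned")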
enableDynamicOverwrite && mode == SaveMode.Overwrite && staticPartitions.size < partitionColumns.length } override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { // Most formats don't do well with duplicate columns, so lets not allow that SchemaUtils.checkColumnNameDuplication( outputColumnNames, s"when inserting into $outputPath", sparkSession.sessionState.conf.caseSensitiveAnalysis) val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(options) val fs = outputPath.getFileSystem(hadoopConf) val qualifiedOutputPath = outputPath.makeQualified(fs.getUri, fs.getWorkingDirectory) val partitionsTrackedByCatalog = sparkSession.sessionState.conf.manageFilesourcePartitions && catalogTable.isDefined && catalogTable.get.partitionColumnNames.nonEmpty && catalogTable.get.tracksPartitionsInCatalog var initialMatchingPartitions: Seq[TablePartitionSpec] = Nil var customPartitionLocations: Map[TablePartitionSpec, String] = Map.empty var matchingPartitions: Seq[CatalogTablePartition] = Seq.empty // When partitions are tracked by the catalog, compute all custom partition locations that // may be relevant to the insertion job. if (partitionsTrackedByCatalog) { matchingPartitions = sparkSession.sessionState.catalog.listPartitions( catalogTable.get.identifier, Some(staticPartitions)) initialMatchingPartitions = matchingPartitions.map(_.spec) customPartitionLocations = getCustomPartitionLocations( fs, catalogTable.get, qualifiedOutputPath, matchingPartitions) } val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = java.util.UUID.randomUUID().toString, outputPath = outputPath.toString, dynamicPartitionOverwrite = dynamicPartitionOverwrite) val doInsertion = if (mode == SaveMode.Append) { true } else { val pathExists = fs.exists(qualifiedOutputPath) (mode, pathExists) match { case (SaveMode.ErrorIfExists, true) => throw new AnalysisException(s"path $qualifiedOutputPath already exists.") case (SaveMode.Overwrite, true) => if (ifPartitionNotExists && matchingPartitions.nonEmpty) { false } else if (dynamicPartitionOverwrite) { // For dynamic partition overwrite, do not delete partition directories ahead. true } else { deleteMatchingPartitions(fs, qualifiedOutputPath, customPartitionLocations, committer) true } case (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) => true case (SaveMode.Ignore, exists) => !exists case (s, exists) => throw new IllegalStateException(s"unsupported save mode $s ($exists)") } } if (doInsertion) { def refreshUpdatedPartitions(updatedPartitionPaths: Set[String]): Unit = { val updatedPartitions = updatedPartitionPaths.map(PartitioningUtils.parsePathFragment) if (partitionsTrackedByCatalog) { val newPartitions = updatedPartitions -- initialMatchingPartitions if (newPartitions.nonEmpty) { AlterTableAddPartitionCommand( catalogTable.get.identifier, newPartitions.toSeq.map(p => (p, None)), ifNotExists = true).run(sparkSession) } // For dynamic partition overwrite, we never remove partitions but only update existing // ones. 
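// In the static-overwrite branch below, partitions that matched the static spec before the job but
// were not rewritten by it are dropped from the metastore; retainData = true because their files
// were already removed by deleteMatchingPartitions before the write started.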
if (mode == SaveMode.Overwrite && !dynamicPartitionOverwrite) { val deletedPartitions = initialMatchingPartitions.toSet -- updatedPartitions if (deletedPartitions.nonEmpty) { AlterTableDropPartitionCommand( catalogTable.get.identifier, deletedPartitions.toSeq, ifExists = true, purge = false, retainData = true /* already deleted */).run(sparkSession) } } } } val updatedPartitionPaths = FileFormatWriter.write( sparkSession = sparkSession, plan = child, fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( qualifiedOutputPath.toString, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, statsTrackers = Seq(basicWriteJobStatsTracker(hadoopConf)), options = options) // update metastore partition metadata if (updatedPartitionPaths.isEmpty && staticPartitions.nonEmpty && partitionColumns.length == staticPartitions.size) { // Avoid empty static partition can't loaded to datasource table. val staticPathFragment = PartitioningUtils.getPathFragment(staticPartitions, partitionColumns) refreshUpdatedPartitions(Set(staticPathFragment)) } else { refreshUpdatedPartitions(updatedPartitionPaths) } // refresh cached files in FileIndex fileIndex.foreach(_.refresh()) // refresh data cache if table is cached sparkSession.sharedState.cacheManager.recacheByPath(sparkSession, outputPath, fs) if (catalogTable.nonEmpty) { CommandUtils.updateTableStats(sparkSession, catalogTable.get) // TODO: ------------ start:二次开发代码 --------------- // if (catalogTable.get.partitionColumnNames.nonEmpty && updatedPartitionPaths.nonEmpty) { updatePartitionsMetadata(sparkSession, updatedPartitionPaths) } // TODO: ------------ end:二次开发代码 --------------- // } } else { logInfo("Skipping insertion into a relation that already exists.") } Seq.empty[Row] } /** * Update the specified partition metadata information. */ private def updatePartitionsMetadata(sparkSession: SparkSession, updatedPartitionPaths: Set[String]): Unit = { logInfo("Current partition table, will update partition information soon.") val catalog = sparkSession.sessionState.catalog val identifier = catalogTable.get.identifier try { val partitions = updatedPartitionPaths.map(partitionPath => Try { val partitionSpec = partitionPath.split("/").map(_.split("=")) .filter(_.length == 2).map {case Array(a, b) => (a, b)}.toMap catalog.getPartition(identifier, partitionSpec) }) val newPartitions = partitions.filter(_.isSuccess).map(_.get) .zipWithIndex.flatMap { case (p, _) => // Statistical partition file size val newSize = CommandUtils.calculateSingleLocationSize( sparkSession.sessionState, identifier, Some(p.location)) val rowCount = if (p.stats.isDefined && p.stats.get.rowCount.isDefined) { p.stats.get.rowCount.get } else BigInt(1) val newStats = CommandUtils.compareAndGetNewStats(p.stats, newSize, Some(rowCount)) val numFiles = p.parameters.getOrElse("numFiles", "1") val newStatParameters = Map("numFiles" -> numFiles, "rawDataSize" -> newSize.toString, "totalSize" -> newSize.toString) val newParameters = p.parameters ++ newStatParameters newStats.map(_ => p.copy(stats = newStats, parameters = newParameters)) } // update metastore partition metadata catalog.alterPartitions(identifier, newPartitions.toSeq) logInfo(s"All partition information updates have been completed") } catch { case e: Throwable => logError( "Partition table metadata information update failed.", e) } } /** * Deletes all partition files that match the specified static prefix. 
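* For example, with staticPartitions = Map("dt" -> "20211231") the cleared prefix under the output
* path is /dt=20211231 (see the staticPartitionPrefix computed below).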
Partitions with custom * locations are also cleared based on the custom locations map given to this class. */ private def deleteMatchingPartitions( fs: FileSystem, qualifiedOutputPath: Path, customPartitionLocations: Map[TablePartitionSpec, String], committer: FileCommitProtocol): Unit = { val staticPartitionPrefix = if (staticPartitions.nonEmpty) { "/" + partitionColumns.flatMap { p => staticPartitions.get(p.name).map(getPartitionPathString(p.name, _)) }.mkString("/") } else { "" } // first clear the path determined by the static partition keys (e.g. /table/foo=1) val staticPrefixPath = qualifiedOutputPath.suffix(staticPartitionPrefix) if (fs.exists(staticPrefixPath) && !committer.deleteWithJob(fs, staticPrefixPath, true)) { throw new IOException(s"Unable to clear output " + s"directory $staticPrefixPath prior to writing to it") } // now clear all custom partition locations (e.g. /custom/dir/where/foo=2/bar=4) for ((spec, customLoc) <- customPartitionLocations) { assert( (staticPartitions.toSet -- spec).isEmpty, "Custom partition location did not match static partitioning keys") val path = new Path(customLoc) if (fs.exists(path) && !committer.deleteWithJob(fs, path, true)) { throw new IOException(s"Unable to clear partition " + s"directory $path prior to writing to it") } } } /** * Given a set of input partitions, returns those that have locations that differ from the * Hive default (e.g. /k1=v1/k2=v2). These partitions were manually assigned locations by * the user. * * @return a mapping from partition specs to their custom locations */ private def getCustomPartitionLocations( fs: FileSystem, table: CatalogTable, qualifiedOutputPath: Path, partitions: Seq[CatalogTablePartition]): Map[TablePartitionSpec, String] = { partitions.flatMap { p => val defaultLocation = qualifiedOutputPath.suffix( "/" + PartitioningUtils.getPathFragment(p.spec, table.partitionSchema)).toString val catalogLocation = new Path(p.location).makeQualified( fs.getUri, fs.getWorkingDirectory).toString if (catalogLocation != defaultLocation) { Some(p.spec -> catalogLocation) } else { None } }.toMap } } ================================================ FILE: fire-enhance/apache-spark/src/main/scala-spark-3.0/org/apache/spark/streaming/scheduler/ExecutorAllocationManager.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.spark.streaming.scheduler import scala.util.Random import org.apache.spark.{ExecutorAllocationClient, SparkConf} import org.apache.spark.internal.Logging import org.apache.spark.internal.config.Streaming._ import org.apache.spark.streaming.util.RecurringTimer import org.apache.spark.util.{Clock, Utils} /** * Class that manages executors allocated to a StreamingContext, and dynamically requests or kills * executors based on the statistics of the streaming computation. This is different from the core * dynamic allocation policy; the core policy relies on executors being idle for a while, but the * micro-batch model of streaming prevents any particular executors from being idle for a long * time. Instead, the measure of "idle-ness" needs to be based on the time taken to process * each batch. * * At a high level, the policy implemented by this class is as follows: * - Use StreamingListener interface get batch processing times of completed batches * - Periodically take the average batch completion times and compare with the batch interval * - If (avg. proc. time / batch interval) >= scaling up ratio, then request more executors. * The number of executors requested is based on the ratio = (avg. proc. time / batch interval). * - If (avg. proc. time / batch interval) <= scaling down ratio, then try to kill an executor that * is not running a receiver. * * This features should ideally be used in conjunction with backpressure, as backpressure ensures * system stability, while executors are being readjusted. * * Note that an initial set of executors (spark.executor.instances) was allocated when the * SparkContext was created. This class scales executors up/down after the StreamingContext * has started. */ private[streaming] class ExecutorAllocationManager( client: ExecutorAllocationClient, receiverTracker: ReceiverTracker, conf: SparkConf, batchDurationMs: Long, clock: Clock) extends StreamingListener with Logging { private val scalingIntervalSecs = conf.get(STREAMING_DYN_ALLOCATION_SCALING_INTERVAL) private val scalingUpRatio = conf.get(STREAMING_DYN_ALLOCATION_SCALING_UP_RATIO) private val scalingDownRatio = conf.get(STREAMING_DYN_ALLOCATION_SCALING_DOWN_RATIO) private val minNumExecutors = conf.get(STREAMING_DYN_ALLOCATION_MIN_EXECUTORS) .getOrElse(math.max(1, receiverTracker.numReceivers())) private val maxNumExecutors = conf.get(STREAMING_DYN_ALLOCATION_MAX_EXECUTORS) private val upStep = conf.get(STREAMING_DYN_ALLOCATION_EXECUTOR_UP_STEP) private val downStep = conf.get(STREAMING_DYN_ALLOCATION_EXECUTOR_DOWN_STEP) private val timer = new RecurringTimer(clock, scalingIntervalSecs * 1000, _ => manageAllocation(), "streaming-executor-allocation-manager") @volatile private var batchProcTimeSum = 0L @volatile private var batchProcTimeCount = 0 validateSettings() def start(): Unit = { timer.start() logInfo(s"ExecutorAllocationManager started with " + s"ratios = [$scalingUpRatio, $scalingDownRatio] and interval = $scalingIntervalSecs sec") } def stop(): Unit = { timer.stop(interruptTimer = true) logInfo("ExecutorAllocationManager stopped") } /** * Manage executor allocation by requesting or killing executors based on the collected * batch statistics. 
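* A worked example of the policy implemented below: with a 10s batch interval, an average
* processing time of 9.5s gives ratio 0.95, which at the default scalingUpRatio of 0.9 requests
* max(round(0.95), upStepSize) = 1 extra executor; an average of 2s gives ratio 0.2, below the
* default scalingDownRatio of 0.3, so up to downStepSize non-receiver executors are asked to be
* killed instead. upStepSize and downStepSize correspond to the added
* spark.streaming.dynamicAllocation.upStepSize and spark.streaming.dynamicAllocation.downStepSize
* settings (both default to 1).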
*/ private def manageAllocation(): Unit = synchronized { logInfo(s"Managing executor allocation with ratios = [$scalingUpRatio, $scalingDownRatio]") if (batchProcTimeCount > 0) { val averageBatchProcTime = batchProcTimeSum / batchProcTimeCount val ratio = averageBatchProcTime.toDouble / batchDurationMs logInfo(s"Average: $averageBatchProcTime, ratio = $ratio" ) if (ratio >= scalingUpRatio) { logDebug("Requesting executors") // TODO: ------------ start:二次开发代码 --------------- // val numNewExecutors = math.max(math.round(ratio).toInt, upStep) // TODO: ------------ end:二次开发代码 --------------- // requestExecutors(numNewExecutors) } else if (ratio <= scalingDownRatio) { logDebug("Killing executors") // TODO: ------------ start:二次开发代码 --------------- // (1 to downStep).foreach(_ => killExecutor()) // TODO: ------------ end:二次开发代码 --------------- // } } batchProcTimeSum = 0 batchProcTimeCount = 0 } /** Request the specified number of executors over the currently active one */ private def requestExecutors(numNewExecutors: Int): Unit = { require(numNewExecutors >= 1) val allExecIds = client.getExecutorIds() logDebug(s"Executors (${allExecIds.size}) = ${allExecIds}") val targetTotalExecutors = math.max(math.min(maxNumExecutors, allExecIds.size + numNewExecutors), minNumExecutors) client.requestTotalExecutors(targetTotalExecutors, 0, Map.empty) logInfo(s"Requested total $targetTotalExecutors executors") } /** Kill an executor that is not running any receiver, if possible */ private def killExecutor(): Unit = { val allExecIds = client.getExecutorIds() logDebug(s"Executors (${allExecIds.size}) = ${allExecIds}") if (allExecIds.nonEmpty && allExecIds.size > minNumExecutors) { val execIdsWithReceivers = receiverTracker.allocatedExecutors.values.flatten.toSeq logInfo(s"Executors with receivers (${execIdsWithReceivers.size}): ${execIdsWithReceivers}") val removableExecIds = allExecIds.diff(execIdsWithReceivers) logDebug(s"Removable executors (${removableExecIds.size}): ${removableExecIds}") if (removableExecIds.nonEmpty) { val execIdToRemove = removableExecIds(Random.nextInt(removableExecIds.size)) client.killExecutor(execIdToRemove) logInfo(s"Requested to kill executor $execIdToRemove") } else { logInfo(s"No non-receiver executors to kill") } } else { logInfo("No available executor to kill") } } private def addBatchProcTime(timeMs: Long): Unit = synchronized { batchProcTimeSum += timeMs batchProcTimeCount += 1 logDebug( s"Added batch processing time $timeMs, sum = $batchProcTimeSum, count = $batchProcTimeCount") } private def validateSettings(): Unit = { require( scalingUpRatio > scalingDownRatio, s"Config ${STREAMING_DYN_ALLOCATION_SCALING_UP_RATIO.key} must be more than config " + s"${STREAMING_DYN_ALLOCATION_SCALING_DOWN_RATIO.key}") if (conf.contains(STREAMING_DYN_ALLOCATION_MIN_EXECUTORS.key) && conf.contains(STREAMING_DYN_ALLOCATION_MAX_EXECUTORS.key)) { require( maxNumExecutors >= minNumExecutors, s"Config ${STREAMING_DYN_ALLOCATION_MAX_EXECUTORS.key} must be more than config " + s"${STREAMING_DYN_ALLOCATION_MIN_EXECUTORS.key}") } } override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = { logDebug("onBatchCompleted called: " + batchCompleted) if (!batchCompleted.batchInfo.outputOperationInfos.values.exists(_.failureReason.nonEmpty)) { batchCompleted.batchInfo.processingDelay.foreach(addBatchProcTime) } } } private[streaming] object ExecutorAllocationManager extends Logging { def isDynamicAllocationEnabled(conf: SparkConf): Boolean = { val 
streamingDynamicAllocationEnabled = Utils.isStreamingDynamicAllocationEnabled(conf) if (Utils.isDynamicAllocationEnabled(conf) && streamingDynamicAllocationEnabled) { throw new IllegalArgumentException( """ |Dynamic Allocation cannot be enabled for both streaming and core at the same time. |Please disable core Dynamic Allocation by setting spark.dynamicAllocation.enabled to |false to use Dynamic Allocation in streaming. """.stripMargin) } streamingDynamicAllocationEnabled } def createIfEnabled( client: ExecutorAllocationClient, receiverTracker: ReceiverTracker, conf: SparkConf, batchDurationMs: Long, clock: Clock): Option[ExecutorAllocationManager] = { if (isDynamicAllocationEnabled(conf) && client != null) { Some(new ExecutorAllocationManager(client, receiverTracker, conf, batchDurationMs, clock)) } else None } } ================================================ FILE: fire-enhance/pom.xml ================================================ 4.0.0 fire-enhance pom Fire : Enhance : com.zto.fire fire-parent 2.3.2-SNAPSHOT ../pom.xml apache-spark apache-flink apache-arthas org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-examples/flink-examples/pom.xml ================================================ 4.0.0 flink-examples_${flink.reference} jar Fire : Examples : Flink com.zto.fire fire-examples 2.3.2-SNAPSHOT ../pom.xml com.zto.fire fire-common_${scala.binary.version} ${fire.version} com.zto.fire fire-core_${scala.binary.version} ${fire.version} com.zto.fire fire-flink_${flink.reference} ${fire.version} com.zto.fire fire-enhance-flink_${flink.reference} ${fire.version} com.zto.fire fire-enhance-arthas_${scala.binary.version} ${fire.version} com.zto.fire fire-connector-hbase_${scala.binary.version} ${fire.version} com.zto.fire fire-connector-jdbc_${scala.binary.version} ${fire.version} com.zto.fire fire-connector-flink-rocketmq_${flink.reference} ${fire.version} com.zto.fire fire-metrics_${scala.binary.version} ${fire.version} com.sparkjava spark-core ${sparkjava.version} org.apache.flink flink-java ${flink.version} ${maven.scope} org.apache.flink flink-scala_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-streaming-scala_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-clients_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-runtime-web_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-queryable-state-client-java ${flink.version} ${maven.scope} org.apache.flink flink-statebackend-rocksdb_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-connector-kafka_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.kafka kafka_${scala.binary.version} ${kafka.version} ${maven.scope} org.apache.flink flink-table-planner_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-table-api-java-bridge_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-table-api-java ${flink.version} ${maven.scope} org.apache.flink flink-table-api-scala-bridge_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-table-common ${flink.version} ${maven.scope} org.apache.flink flink-connector-hive_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-connector-jdbc_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-json ${flink.version} ${maven.scope} 
org.apache.flink flink-connector-elasticsearch-base_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-hadoop-compatibility_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.rocketmq rocketmq-client ${rocketmq.version} org.apache.rocketmq rocketmq-acl ${rocketmq.version} org.apache.flink flink-orc-nohive_${scala.binary.version} ${flink.version} org.apache.flink flink-shaded-hadoop-2-uber 2.6.5-8.0 ${maven.scope} javax.servlet servlet-api org.apache.hive hive-exec ${hive.flink.version} ${maven.scope} calcite-core org.apache.calcite org.apache.hbase hbase-common ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-server ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-client ${hbase.version} ${maven.scope} calcite-core org.apache.calcite com.oracle ojdbc6 11.2.0.3 ${maven.scope} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-examples/flink-examples/src/main/java/com/zto/fire/examples/bean/People.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.bean; import java.math.BigDecimal; import java.util.LinkedList; import java.util.List; public class People { private Long id; private String name; private Integer age; private Double length; private BigDecimal data; public People() { } public People(Long id, String name, Integer age, Double length, BigDecimal data) { this.id = id; this.name = name; this.age = age; this.length = length; this.data = data; } public static List createList() { List list = new LinkedList<>(); for (int i=0; i<10; i++) { list.add(new People((long) i, "admin_" + i, i, i * 0.1, new BigDecimal(i * 10.1012))); } return list; } } ================================================ FILE: fire-examples/flink-examples/src/main/java/com/zto/fire/examples/bean/Student.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.bean; import com.zto.fire.common.anno.FieldName; import com.zto.fire.common.util.JSONUtils; import com.zto.fire.hbase.bean.HBaseBaseBean; import com.zto.fire.common.util.DateFormatUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.math.BigDecimal; import java.util.Arrays; import java.util.LinkedList; import java.util.List; import java.util.Objects; /** * 对应HBase表的JavaBean * * @author ChengLong 2019-6-20 16:06:16 */ public class Student extends HBaseBaseBean { @FieldName(value = "Student", disuse = true) protected static final transient Logger logger = LoggerFactory.getLogger(Student.class); private Long id; private String name; private Integer age; // 多列族情况下需使用family单独指定 private String createTime; // 若JavaBean的字段名称与HBase中的字段名称不一致,需使用value单独指定 // 此时hbase中的列名为length1,而不是length // @FieldName(family = "info", value = "length1") private BigDecimal length; private Boolean sex; /** * rowkey的构建 * * @return */ @Override public Student buildRowKey() { this.rowKey = this.id.toString(); return this; } public Student(Long id, String name) { this.id = id; this.name = name; } public Student(Long id, String name, Integer age) { this.id = id; this.name = name; this.age = age; } public Student(Long id, String name, Integer age, BigDecimal length, Boolean sex, String createTime) { this.id = id; this.name = name; this.age = age; this.length = length; this.sex = sex; this.createTime = createTime; } public Student(Long id, String name, Integer age, BigDecimal length) { this.id = id; this.name = name; this.age = age; this.length = length; } public Student() { } public Student(Long id) { this.id = id; } public String getCreateTime() { return createTime; } public void setCreateTime(String createTime) { this.createTime = createTime; } public BigDecimal getLength() { return length; } public void setLength(BigDecimal length) { this.length = length; } public Boolean getSex() { return sex; } public void setSex(Boolean sex) { this.sex = sex; } public Long getId() { return id; } public void setId(Long id) { this.id = id; } public String getName() { return name; } public void setName(String name) { this.name = name; } public Integer getAge() { return age; } public void setAge(Integer age) { this.age = age; } public void setClassName(String name) {} @Override public String toString() { return JSONUtils.toJSONString(this); } public static List newStudentList() { String dateTime = DateFormatUtils.formatCurrentDateTime(); return Arrays.asList( new Student(1L, "admin", 12, BigDecimal.valueOf(12.1), true, dateTime), new Student(2L, "root", 22, BigDecimal.valueOf(22), true, dateTime), new Student(3L, "scala", 11, BigDecimal.valueOf(11), true, dateTime), new Student(4L, "spark", 15, BigDecimal.valueOf(15), true, dateTime), new Student(5L, "java", 16, BigDecimal.valueOf(16.1), true, dateTime), new Student(6L, "hive", 17, BigDecimal.valueOf(17.1), true, dateTime), new Student(7L, "presto", 18, BigDecimal.valueOf(18.1), true, dateTime), new Student(8L, "flink", 19, BigDecimal.valueOf(19.1), true, dateTime), new Student(9L, "streaming", 10, BigDecimal.valueOf(10.1), true, dateTime), new Student(10L, "sql", 12, BigDecimal.valueOf(12.1), true, dateTime) ); } /** * 构建student集合 * * @return */ public static List buildStudentList() { List studentList = new LinkedList<>(); try { for (int i = 1; i <= 1; i++) { Thread.sleep(500); Student stu = new Student(1L, "root", i + 1, BigDecimal.valueOf((long) 1 + i), true, DateFormatUtils.formatCurrentDateTime()); studentList.add(stu); } for 
(int i = 1; i <= 2; i++) { Thread.sleep(500); Student stu = new Student(2L, "admin", i + 2, BigDecimal.valueOf(2019.05180919 + i), false, DateFormatUtils.formatCurrentDateTime()); studentList.add(stu); } for (int i = 1; i <= 3; i++) { Thread.sleep(500); Student stu = new Student(3L, "spark", i + 3, BigDecimal.valueOf(33.1415926 + i)); studentList.add(stu); } for (int i = 1; i <= 3; i++) { Thread.sleep(500); Student stu = new Student(4L, "flink", i + 4, BigDecimal.valueOf(4.2 + i), true, DateFormatUtils.formatCurrentDateTime()); studentList.add(stu); } for (int i = 1; i <= 3; i++) { Thread.sleep(500); Student stu = new Student(5L, "hadoop", i + 5, BigDecimal.valueOf(5.5 + i), false, DateFormatUtils.formatCurrentDateTime()); studentList.add(stu); } for (int i = 1; i <= 3; i++) { Thread.sleep(500); Student stu = new Student(6L, "hbase", i + 6, BigDecimal.valueOf(66.66 + i), true, DateFormatUtils.formatCurrentDateTime()); studentList.add(stu); } } catch (Exception e) { logger.error("Sleep线程异常", e); } return studentList; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (!(o instanceof Student)) { return false; } Student student = (Student) o; return Objects.equals(id, student.id) && Objects.equals(name, student.name) && Objects.equals(age, student.age) && Objects.equals(createTime, student.createTime) && Objects.equals(length, student.length) && Objects.equals(sex, student.sex); } @Override public int hashCode() { return Objects.hash(id, name, age, createTime, length, sex); } } ================================================ FILE: fire-examples/flink-examples/src/main/java/com/zto/fire/sql/SqlCommandParser.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.sql; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.util.*; import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; public final class SqlCommandParser { protected static final transient Logger logger = LoggerFactory.getLogger(SqlCommandParser.class); private SqlCommandParser() {} public static List parse(List lines) { List calls = new ArrayList<>(); StringBuilder stmt = new StringBuilder(); for (String line : lines) { if (line.trim().isEmpty() || line.startsWith("--")) { // skip empty line and comment line continue; } stmt.append("\n").append(line); if (line.trim().endsWith(";")) { Optional optionalCall = parse(stmt.toString()); if (optionalCall.isPresent()) { calls.add(optionalCall.get()); } else { throw new RuntimeException("Unsupported command '" + stmt.toString() + "'"); } // clear string builder stmt.setLength(0); } } return calls; } public static Optional parse(String stmt) { // normalize stmt = stmt.trim(); // remove ';' at the end if (stmt.endsWith(";")) { stmt = stmt.substring(0, stmt.length() - 1).trim(); } // parse for (SqlCommand cmd : SqlCommand.values()) { final Matcher matcher = cmd.pattern.matcher(stmt); if (matcher.matches()) { final String[] groups = new String[matcher.groupCount()]; for (int i = 0; i < groups.length; i++) { groups[i] = matcher.group(i + 1); } return cmd.operandConverter.apply(groups) .map((operands) -> new SqlCommandCall(cmd, operands)); } } return Optional.empty(); } private static final Function> NO_OPERANDS = (operands) -> Optional.of(new String[0]); private static final Function> SINGLE_OPERAND = (operands) -> Optional.of(new String[]{operands[0]}); private static final int DEFAULT_PATTERN_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL; /** * Supported SQL commands. */ public enum SqlCommand { INSERT_INTO( "(INSERT\\s+INTO.*)", SINGLE_OPERAND), CREATE_TABLE( "(CREATE\\s+TABLE.*)", SINGLE_OPERAND), CREATE_VIEW( "(CREATE\\s+VIEW.*)", SINGLE_OPERAND), SET( "SET(\\s+(\\S+)\\s*=(.*))?", // whitespace is only ignored on the left side of '=' (operands) -> { if (operands.length < 3) { return Optional.empty(); } else if (operands[0] == null) { return Optional.of(new String[0]); } return Optional.of(new String[]{operands[1], operands[2]}); }); public final Pattern pattern; public final Function> operandConverter; SqlCommand(String matchingRegex, Function> operandConverter) { this.pattern = Pattern.compile(matchingRegex, DEFAULT_PATTERN_FLAGS); this.operandConverter = operandConverter; } @Override public String toString() { return super.toString().replace('_', ' '); } public boolean hasOperands() { return operandConverter != NO_OPERANDS; } } /** * Call of SQL command with operands and command type. 
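* <p>For illustration, judging from the patterns above: parse("SET table.exec.state.ttl=1h")
* yields SqlCommandCall(SET, ["table.exec.state.ttl", "1h"]); a bare "SET" yields
* SqlCommandCall(SET, []); and parse("INSERT INTO t_sink SELECT id FROM t_source") yields
* SqlCommandCall(INSERT_INTO, [the whole statement]). The table names here are hypothetical.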
*/ public static class SqlCommandCall { public final SqlCommand command; public final String[] operands; public SqlCommandCall(SqlCommand command, String[] operands) { this.command = command; this.operands = operands; } public SqlCommandCall(SqlCommand command) { this(command, new String[0]); } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } SqlCommandCall that = (SqlCommandCall) o; return command == that.command && Arrays.equals(operands, that.operands); } @Override public int hashCode() { int result = Objects.hash(command); result = 31 * result + Arrays.hashCode(operands); return result; } @Override public String toString() { return command + "(" + Arrays.toString(operands) + ")"; } } private static FileSystem getFiledSystem() throws IOException { Configuration configuration = new Configuration(); FileSystem fileSystem = FileSystem.get(configuration); return fileSystem; } public static void copyHdfsFileToLocal(String filePath, String disFile){ logger.info("copy hdfs to local :" + filePath + ", hdfs:" + disFile); FSDataInputStream fsDataInputStream = null; try { File file = new File(disFile); if(file.exists()){ file.delete(); file = new File(disFile); } Path path = new Path(filePath); fsDataInputStream = getFiledSystem().open(path); IOUtils.copyBytes(fsDataInputStream, new FileOutputStream(file), 4096, false); } catch (IOException e) { e.printStackTrace(); } finally { if(fsDataInputStream != null){ IOUtils.closeStream(fsDataInputStream); } } } private static void writeHDFS(String localPath, String hdfsPath){ logger.info("copy file to hdfs :" + localPath + ", hdfs:" + hdfsPath); FSDataOutputStream outputStream = null; FileInputStream fileInputStream = null; try { Path path = new Path(hdfsPath); FileSystem fileSystem = getFiledSystem(); if(fileSystem.exists(path)){ fileSystem.delete(path); } outputStream = fileSystem.create(path); fileInputStream = new FileInputStream(new File(localPath)); IOUtils.copyBytes(fileInputStream, outputStream,4096, false); } catch (IOException e) { e.printStackTrace(); }finally { if(fileInputStream != null){ IOUtils.closeStream(fileInputStream); } if(outputStream != null){ IOUtils.closeStream(outputStream); } } } /** * 接受sql文件,上传到hdfs,供run-application模式下载文件到本地,文件必须为绝对路径 * @param args */ public static void main(String[] args) { String sqlFile = args[0]; logger.info("sqlFile:" + sqlFile); String path = "/tmp/" + sqlFile.substring(sqlFile.lastIndexOf("/") + 1); logger.info("path:" + path); writeHDFS(sqlFile, path); } } ================================================ FILE: fire-examples/flink-examples/src/main/resources/META-INF/services/org.apache.flink.table.factories.Factory ================================================ com.zto.fire.flink.sql.connector.rocketmq.RocketMQDynamicTableFactory ================================================ FILE: fire-examples/flink-examples/src/main/resources/common.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # fire.analysis.arthas.tunnel_server.url = ws://arthas_tunnel_server:7777/ws # \u5B9A\u4E49url\u7684\u522B\u540D\u4E0Eurl\u5BF9\u5E94\u5173\u7CFB\uFF0C\u540E\u7EED\u53EF\u901A\u8FC7\u522B\u540D\u8FDB\u884C\u914D\u7F6E flink.db.jdbc.url.map.test = jdbc:mysql://mysql-server:3306/fire # \u652F\u6301\u522B\u540D\u6216\u76F4\u63A5\u6307\u5B9Aurl flink.db.jdbc.url = test flink.db.jdbc.driver = com.mysql.jdbc.Driver flink.db.jdbc.user = root flink.db.jdbc.password = fire flink.db.jdbc.url2 = jdbc:mysql://mysql-server:3306/fire2 flink.db.jdbc.driver2 = com.mysql.jdbc.Driver flink.db.jdbc.user2 = root flink.db.jdbc.password2 = fire flink.db.jdbc.url6 = jdbc:mysql://mysql-server:3306/fire6 flink.db.jdbc.driver6 = com.mysql.jdbc.Driver flink.db.jdbc.user6 = root flink.db.jdbc.password6 = fire flink.db.jdbc.url7 = jdbc:mysql://mysql-server:3306/fire7 flink.db.jdbc.driver7 = com.mysql.jdbc.Driver flink.db.jdbc.user7 = root flink.db.jdbc.password7 = fire flink.db.jdbc.url8 = jdbc:mysql://mysql-server:3306/fire8 flink.db.jdbc.driver8 = com.mysql.jdbc.Driver flink.db.jdbc.user8 = root flink.db.jdbc.password8 = fire fire.rest.filter.enable = false fire.rest.url.show.enable = true # ---------------------------------------------- < \u5F02\u5E38\u8BCA\u65AD\u914D\u7F6E > ----------------------------------------------- # fire.analysis.log.exception.send.mq.url = bigdata_test fire.analysis.log.exception.stack.enable = true # \u662F\u5426\u6253\u5370\u914D\u7F6E\u4FE1\u606F fire.conf.show.enable = true # ------------------------------- < Flink SQL with\u4E2Ddatasource\u5BF9\u5E94\u7684\u6570\u636E\u6E90\u5B9A\u4E49 > ----------------------------- # # flink.sql.with.\u4E3A\u524D\u7F00\u5B9A\u4E49flink sql\u4E2Doptions\u7684\u522B\u540D\uFF0C\u7136\u540E\u5728sql\u4E2D\u901A\u8FC7datasource\u5F15\u7528 # \u6BD4\u5982\u4E0B\u9762\u914D\u7F6E\u5B9A\u4E49\u4E86\u540D\u4E3Ajdbc_test\u7684\u6570\u636E\u6E90\u522B\u540D\uFF0Csql\u4E2D\u5219\u4F7F\u7528 'datasource' = 'jdbc_test'\u5F15\u7528 # fire\u6846\u67B6\u4F1A\u81EA\u52A8\u5C06\u6240\u6709\u522B\u540D\u4E3Ajdbc_test\u7684options\u66FF\u6362\u5230sql\u4E2D flink.sql.with.jdbc_test.connector = jdbc flink.sql.with.jdbc_test.url = jdbc:mysql://mysql-server:3306/fire flink.sql.with.jdbc_test.username = root flink.sql.with.jdbc_test.password = root flink.sql.with.jdbc_test.driver = com.mysql.jdbc.Driver flink.sql.with.kafka_test.connector = kafka flink.sql.with.kafka_test.topic = fire flink.sql.with.kafka_test.properties.bootstrap.servers=kafka-server:9092 flink.sql.with.kafka_test.properties.group.id= fire flink.sql.with.kafka_test2.connector = kafka flink.sql.with.kafka_test2.topic = fire2 flink.sql.with.kafka_test2.properties.bootstrap.servers=kafka-server:9092 flink.sql.with.kafka_test2.properties.group.id= fire # ---------------------------------------------- < \u8840\u7F18\u91C7\u96C6\u914D\u7F6E > ----------------------------------------------- # # \u662F\u5426\u5F00\u542F\u5C06\u8840\u7F18\u4FE1\u606F\u53D1\u9001\u5230\u6D88\u606F\u961F\u5217 fire.lineage.send.mq.enable = true fire.lineage.send.mq.url = bigdata_test # 
\u5B9A\u65F6\u89E3\u6790\u57CB\u70B9SQL\u7684\u6267\u884C\u9891\u7387\uFF08s\uFF09 fire.lineage.run.period = 10 ================================================ FILE: fire-examples/flink-examples/src/main/resources/connector/hive/HiveSinkTest.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # flink.hive.cluster = test flink.sql.udf.fireUdf.enable = false flink.kafka.brokers.name = bigdata_test # \u5FC5\u987B\u914D\u7F6E\u9879\uFF1Akafka\u7684topic\u5217\u8868\uFF0C\u4EE5\u9017\u53F7\u5206\u9694 flink.kafka.topics = fire flink.kafka.group.id = fire flink.fire.rest.filter.enable = false flink.fire.config_center.enable = true flink.fire.rest.url.show.enable = true flink.db.jdbc.batch.size3 = 3 flink.stream.checkpoint.interval = 10000 # flink\u6240\u652F\u6301\u7684\u53C2\u6570 state.checkpoints.num-retained = 3 state.backend.incremental = true state.backend.rocksdb.files.open = 5000 flink.sql.log.enable = true flink.sql_with.replaceMode.enable = true # sql\u4E2Dwith\u8868\u8FBE\uFF0C\u914D\u7F6E\u65B9\u6CD5\u662F\u4EE5flink.sql.with\u5F00\u5934\uFF0C\u8DDF\u4E0Aconnector\u7684key\uFF0C\u4EE5\u6570\u5B57\u7ED3\u5C3E\uFF0C\u7528\u4E8E\u533A\u5206\u4E0D\u540C\u7684connector flink.sql.with.connector=jdbc flink.sql.with.url=jdbc:mysql://localhost:3306/mydatabase flink.sql.with.table-name=users flink.sql.with.password=fire flink.sql.with.connector2=jdbc2 flink.sql.with.url2=jdbc2:mysql://localhost:3306/mydatabase flink.sql.with.table-name2=users2 flink.sql.with.password2=fire flink.rocket.topics=fire flink.rocket.group.id=fire flink.rocket.brokers.name=localhost:9876 ================================================ FILE: fire-examples/flink-examples/src/main/resources/log4j.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# log4j.rootLogger = WARN, stdout, D ### 输出到控制台 ### log4j.appender.stdout = org.apache.log4j.ConsoleAppender log4j.appender.stdout.Target = System.out log4j.appender.stdout.layout = org.apache.log4j.PatternLayout log4j.appender.stdout.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss.SSS} [%thread]-[%p]-[%c] %m%n ### 输出到日志文件 ### log4j.appender.D = org.apache.log4j.DailyRollingFileAppender log4j.appender.D.File = ./fire.log log4j.appender.D.Append = true log4j.appender.D.Threshold = INFO log4j.appender.D.layout = org.apache.log4j.PatternLayout log4j.appender.D.layout.ConversionPattern = %-d{yyyy-MM-dd HH:mm:ss.SSS} [%thread]-[%p]-[%c]-[%l] %m%n ================================================ FILE: fire-examples/flink-examples/src/main/resources/stream/ConfigCenterTest.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # hive.cluster = test sql.udf.fireUdf.enable = false kafka.brokers.name = bigdata_test # 必须配置项:kafka的topic列表,以逗号分隔 kafka.topics = fire kafka.group.id = fire fire.rest.filter.enable = false fire.config_center.enable = true fire.rest.url.show.enable = true db.jdbc.batch.size3 = 3 stream.checkpoint.interval = 10000 # flink所支持的参数 state.checkpoints.num-retained = 3 state.backend.incremental = true state.backend.rocksdb.files.open = 5000 sql.log.enable = true sql_with.replaceMode.enable = true #fire.thread.pool.size=10 fire.thread.pool.size=6 fire.restful.max.thread=9 fire.jdbc.query.partitions=11 fire.hbase.scan.repartitions=110 fire.acc.log.max.size=22 fire.conf.test=scala ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/FlinkDemo.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.core.anno.connector.{Hive, Kafka} import com.zto.fire.core.anno.lifecycle.Process import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Streaming /** * 基于Fire进行Flink Streaming开发 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Config( """ |# 支持Flink调优参数、Fire框架参数、用户自定义参数等 |state.checkpoints.num-retained=30 |state.checkpoints.dir=hdfs:///user/flink/checkpoint |""") @Hive("thrift://localhost:9083") // 配置连接到指定的hive @Streaming(interval = 100, unaligned = true, parallelism = 4) // 100s做一次checkpoint,开启非对齐checkpoint @Kafka(brokers = "localhost:9092", topics = "fire", groupId = "fire") object FlinkDemo extends FlinkStreaming { @Process def kafkaSource: Unit = { val dstream = this.fire.createKafkaDirectStream() // 使用api的方式消费kafka sql("""create table statement ...""") sql("""insert into statement ...""") } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/FlinkSQLDemo.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink import com.zto.fire.core.anno.lifecycle.{Step1, Step2, Step3} import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Streaming /** * 基于Fire进行flink sql开发 * 使用@Step注解的无参方法将按数值顺序依次被Fire框架调用: *
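* (That is, the parameterless methods annotated @Step1/@Step2/@Step3 below are invoked by the
* framework in ascending numeric order, producing log output along the lines of:)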
 * Step1. 定义源表表结构
 * Step1. 执行耗时:105.00ms
 *
 * Step2. 定义目标表结构
 * Step2. 执行耗时:5.00ms
 *
 * Step3. 执行insert语句
 * Step3. 执行耗时:322.00ms
 *
 * Finished. 总计:3个 成功:3个 失败:0个, 执行耗时:433.00ms
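 *
 * A minimal sketch of the @Step lifecycle described above (illustrative only: the object name,
 * step descriptions and table names below are assumptions for this doc comment, not code from
 * this repository; the real usage is the FlinkSQLDemo object that follows):
 * {{{
 *   @Streaming(interval = 10)
 *   object StepDemo extends FlinkStreaming {
 *     @Step1("define source table")
 *     def createSource: Unit = sql("CREATE TABLE t_source (id BIGINT) WITH ('connector' = 'datagen')")
 *
 *     @Step2("insert into print sink")
 *     def insertInto: Unit = {
 *       sql("CREATE TABLE t_print WITH ('connector' = 'print') LIKE t_source (EXCLUDING ALL)")
 *       sql("insert into t_print select id from t_source")
 *     }
 *   }
 * }}}
 * Fire invokes the no-arg @StepN methods once, in ascending numeric order, and prints the
 * per-step timing summary shown above.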
* * @author ChengLong * @contact Fire框架技术交流群(钉钉):35373471 */ @Streaming(interval = 10) object FlinkSQLDemo extends FlinkStreaming { @Step1("定义源表表结构") def sourceTable: Unit = { sql(s""" | CREATE TABLE t_student ( | id BIGINT, | name STRING, | age INT, | createTime TIMESTAMP(13), | sex Boolean |) WITH ( | 'connector' = 'datagen', | 'rows-per-second'='100', -- 5000/s | 'fields.id.min'='1', -- id字段,1到1000之间 | 'fields.id.max'='1000', | 'fields.name.length'='5', -- name字段,长度为5 | 'fields.age.min'='1', -- age字段,1到120岁 | 'fields.age.max'='120' |) |""".stripMargin) } @Step2("定义目标表结构") def destTable: Unit = { sql(s""" |CREATE TABLE t_print_table WITH ('connector' = 'print') |LIKE t_student (EXCLUDING ALL) |""".stripMargin) } @Step3("执行insert语句") def insertStatement: Unit = { sql( s""" |insert into t_print_table |select | id, name, age, createTime, sex |from t_student |group by id, name, age, createTime, sex |""".stripMargin) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/Test.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.{DateFormatUtils, JSONUtils, ThreadUtils} import com.zto.fire.core.anno.connector._ import com.zto.fire.core.anno.lifecycle.{Process, Step1} import com.zto.fire.examples.bean.Student import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Streaming import com.zto.fire.flink.sync.FlinkLineageAccumulatorManager import com.zto.fire.hbase.HBaseConnector import com.zto.fire.predef.println import org.apache.flink.api.scala._ import java.util.concurrent.TimeUnit @HBase("test") @Streaming(interval = 60, unaligned = true, parallelism = 2) // 100s做一次checkpoint,开启非对齐checkpoint @RocketMQ(brokers = "bigdata_test", topics = "fire", groupId = "fire") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") @Jdbc(url = "jdbc:mysql://mysql-server:3306/fire", username = "root", password = "root") object Test extends FlinkStreaming { private val hbaseTable = "fire_test_1" private lazy val tableName = "spark_test" @Process def kafkaSource: Unit = { this.fire.createKafkaDirectStream().print() val dstream = this.fire.createRocketMqPullStream() dstream.map(t => { val timestamp = DateFormatUtils.formatCurrentDateTime() val insertSql = s"INSERT INTO $tableName (name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" this.fire.jdbcUpdate(insertSql, Seq("admin", 12, timestamp, 10.0, 1)) HBaseConnector.get[Student](hbaseTable, classOf[Student], Seq("1")) t }).print() } @Step1("获取血缘信息") def lineage: Unit = { ThreadUtils.scheduleAtFixedRate({ println(s"累加器值:" + JSONUtils.toJSONString(FlinkLineageAccumulatorManager.getValue)) }, 0, 10, TimeUnit.SECONDS) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/acc/FlinkAccTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.acc import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.core.anno.connector.Kafka import com.zto.fire.flink.FlinkStreaming import org.apache.flink.api.common.JobExecutionResult import org.apache.flink.api.common.functions.RichMapFunction import org.apache.flink.api.scala._ import org.apache.flink.streaming.api.scala.DataStream /** * fire-flink计数器与自定义累加器的使用 * * @author ChengLong 2020年1月11日 14:08:56 * @since 0.4.1 * @contact Fire框架技术交流群(钉钉):35373471 */ @Config( """ |flink.max.parallelism = 8 |""") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire", autoCommit = true) // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object FlinkAccTest extends FlinkStreaming { /** * 生命周期方法:具体的用户开发的业务逻辑代码 * 注:此方法会被自动调用,不需要在main中手动调用 */ override def process: Unit = { val dstream = this.fire.createCollectionStream(1 to 100) // 使用内置的计数器 this.testFlinkCounter(dstream) } /** * Fire中内置计数器的使用 */ def testFlinkCounter(dstream: DataStream[Int]): Unit = { // FireMapFunction功能较RichMapFunction等更为强大,推荐使用 // 创建FireMapFunction类型的内部类,支持Map、MapPartition、FlatMap等操作 // 在不同的map函数中进行累加全局有效 dstream.map(new RichMapFunction[Int, Int]() { override def map(value: Int): Int = { // 多值计数器根据累加器的值类型区分不同的计数器,比如传参为Double类型,则累加至DoubleCounter中 this.addCounter("LongCount", value.longValue()) this.addCounter("IntCount", value) this.addCounter("IntCount2", value * 2) this.addCounter("DoubleCount", value.doubleValue()) Thread.sleep(5000) value } }) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/batch/FireMapFunctionTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.batch import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.flink.FlinkBatch import org.apache.flink.api.common.functions.RichMapFunction import org.apache.flink.api.common.state.StateTtlConfig import org.apache.flink.api.common.time.Time import org.apache.flink.api.scala._ /** * 用于演示FireMapFunction的使用,FireMapFunction比RichMapFunction功能更强大 * 提供了多值计数器、常用API函数的便捷使用等,甚至同时支持:map、flatMap、mapPartition等操作 * 内部对状态的api进行了封装,使用起来更简洁 * * @author ChengLong 2020-4-9 15:59:19 * @contact Fire框架技术交流群(钉钉):35373471 */ @Config( """ |flink.fire.config_center.enable=false |""") object FireMapFunctionTest extends FlinkBatch { lazy val dataset = this.fire.createCollectionDataSet(1 to 10) lazy val dataset2 = this.fire.createCollectionDataSet(1 to 3) override def process: Unit = { this.testMap } /** * 使用FireMapFunction进行Map算子操作 */ private def testMap: Unit = { dataset.map(new RichMapFunction[Int, String]() { lazy val ttlConfig = StateTtlConfig.newBuilder(Time.days(1)).build() // 获取广播变量 lazy val brocastValue = this.getRuntimeContext.getBroadcastVariable[Int]("values") override def map(value: Int): String = { // 累加器使用详见:FlinkAccTest.scala this.addCounter("IntCount", 2) this.addCounter("LongCount", 3L) // 广播变量 this.brocastValue.foreach(println) // 状态使用,具有懒加载的能力,根据name从缓存中获取valueState,不需要声明为成员变量或在open方法中初始化 val valueState = this.getState[Int]("fire", ttlConfig) valueState.update(valueState.value()) val listState = this.getListState[Int]("fire_list") listState.add(value) val mapState = this.getMapState[Int, Int]("fire_map", ttlConfig) mapState.put(value, value) value.toString } }).withBroadcastSet(dataset2, "values").print() } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/batch/FlinkBatchTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.batch import com.zto.fire._ import com.zto.fire.flink.FlinkBatch import org.apache.flink.api.common.accumulators.IntCounter import org.apache.flink.api.common.functions.RichMapFunction import org.apache.flink.api.scala._ import org.apache.flink.configuration.Configuration import org.apache.flink.core.fs.FileSystem /** * @contact Fire框架技术交流群(钉钉):35373471 */ object FlinkBatchTest extends FlinkBatch { /** * 生命周期方法:具体的用户开发的业务逻辑代码 * 注:此方法会被自动调用,不需要在main中手动调用 */ override def process: Unit = { this.testAccumulator } def testAccumulator: Unit = { val result = this.fire.createCollectionDataSet(1 to 10).map(new RichMapFunction[Int, Int] { val counter = new IntCounter() override def open(parameters: Configuration): Unit = { this.getRuntimeContext.addAccumulator("myCounter", this.counter) } override def map(value: Int): Int = { this.counter.add(value) value } }) result.writeAsText("J:\\test\\flink.result", FileSystem.WriteMode.OVERWRITE) val result2 = this.fire.createCollectionDataSet(1 to 10).map(new RichMapFunction[Int, Int] { override def map(value: Int): Int = { this.getRuntimeContext.getIntCounter("myCounter").add(value) value } }) result2.writeAsText("J:\\test\\flink.result", FileSystem.WriteMode.OVERWRITE) val count = this.fire.execute("counter").getAccumulatorResult[Int]("myCounter") println("累加器结果:" + count) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/batch/FlinkBrocastTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.batch import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.core.anno.connector.Hive import com.zto.fire.flink.FlinkBatch import org.apache.flink.api.common.functions.RichMapFunction import org.apache.flink.api.scala._ /** * flink广播变量的使用 * * @author ChengLong 2020年2月18日 13:53:06 * @contact Fire框架技术交流群(钉钉):35373471 */ @Hive("test") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object FlinkBrocastTest extends FlinkBatch { override def process: Unit = { val ds = this.fire.createCollectionDataSet(Seq(1, 2, 3, 4, 5)) // flink中可以广播的数据必须是Dataset val brocastDS = this.fire.createCollectionDataSet(Seq("a", "b", "c", "d", "e")) ds.map(new RichMapFunction[Int, String] { // 获取广播变量中的值给当前成员变量(若不想在open方法中获取值,请使用lazy关键字) lazy val broadcastSet: Seq[String] = this.getBroadcastVariable[String]("brocastDS") override def map(value: Int): String = { this.broadcastSet(value - 1) } // 每次使用必须通过withBroadcastSet进行广播 }).withBroadcastSet(brocastDS, "brocastDS").print() } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/FlinkHudiTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.connector import com.zto.fire.common.conf.FireKafkaConf import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Checkpoint @Checkpoint(60) object FlinkHudiTest extends FlinkStreaming { /** * 生命周期方法:具体的用户开发的业务逻辑代码 * 注:此方法会被自动调用,不需要在main中手动调用 */ override def process: Unit = { var sql = """ |CREATE TABLE hudi_table_test( | uuid VARCHAR(20), | action VARCHAR(10), | age INT, | ts BIGINT, | ds VARCHAR(20) |) |PARTITIONED BY (ds) |WITH ( | 'connector' = 'hudi', | 'path' = 'hdfs:///user/flink/huditest/hudi_table_test', | 'table.type' = 'MERGE_ON_READ', | 'compaction.delta_commits' = '3', | 'compaction.delta_seconds' = '300', | 'hoodie.datasource.write.hive_style_partitioning' = 'true' |) |""".stripMargin this.tableEnv.executeSql(sql) sql = s""" |CREATE TABLE kafka_source_table ( | uuid VARCHAR(20), | action VARCHAR(10), | age INT, | ts BIGINT, | ds VARCHAR(20) |) WITH ( | 'connector' = 'kafka', | 'topic' = 'kafka_hudi_test', | 'properties.bootstrap.servers' = '${FireKafkaConf.kafkaBrokers()}', | 'properties.group.id' = 'testGroup', | 'scan.startup.mode' = 'earliest-offset', | 'format' = 'json' |) |""".stripMargin this.tableEnv.executeSql(sql) sql = """ |INSERT INTO hudi_table_test SELECT uuid,action,age,ts,ds FROM kafka_source_table |""".stripMargin this.tableEnv.executeSql(sql) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/bean/BeanConnectorTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.connector.bean import com.zto.fire._ import com.zto.fire.flink.FlinkStreaming /** * Flink流式计算任务模板 * * @author ChengLong * @since 1.0.0 * @create 2021-01-18 17:24 */ object BeanConnectorTest extends FlinkStreaming { override def process: Unit = { val dstream = this.fire.createKafkaDirectStream() sql( """ |CREATE table source ( | id bigint, | name string, | age int, | length double, | data DECIMAL(10, 5) |) |WITH | ( | 'connector' = 'bean', | 'table-name' = 'source', | 'duration' = '5000', | 'repeat-times' = '5' | ) |""".stripMargin) sql( """ |CREATE table sink ( | id bigint, | name string, | age int, | length double, | data DECIMAL(10, 5) |) |WITH | ( | 'connector' = 'bean', | 'table-name' = 'sink' | ) |""".stripMargin) sql( """ |insert into sink select * from source |""".stripMargin) dstream.print() } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/bean/BeanDynamicTableFactory.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.connector.bean import com.zto.fire._ import org.apache.flink.configuration.ConfigOption import org.apache.flink.table.connector.sink.DynamicTableSink import org.apache.flink.table.connector.source.DynamicTableSource import org.apache.flink.table.factories.{DynamicTableFactory, DynamicTableSinkFactory, DynamicTableSourceFactory, FactoryUtil} import org.apache.flink.table.utils.TableSchemaUtils /** * sql connector的source与sink创建工厂 * * @author ChengLong 2021-5-7 15:48:03 */ class BeanDynamicTableFactory extends DynamicTableSourceFactory with DynamicTableSinkFactory { val IDENTIFIER = "bean" /** * 告诉工厂,如何创建Table Source实例 */ override def createDynamicTableSource(context: DynamicTableFactory.Context): DynamicTableSource = { val helper = FactoryUtil.createTableFactoryHelper(this, context) val config = helper.getOptions helper.validate() val physicalSchema = TableSchemaUtils.getPhysicalSchema(context.getCatalogTable.getSchema) new BeanDynamicTableSource(physicalSchema, config, physicalSchema.toRowDataType) } override def factoryIdentifier(): String = this.IDENTIFIER /** * 必填参数列表 */ override def requiredOptions(): JSet[ConfigOption[_]] = { val set = new JHashSet[ConfigOption[_]] set.add(BeanOptions.TABLE_NAME) set } /** * 可选的参数列表 */ override def optionalOptions(): JSet[ConfigOption[_]] = { val optionalOptions = new JHashSet[ConfigOption[_]] optionalOptions.add(BeanOptions.DURATION) optionalOptions.add(BeanOptions.repeatTimes) optionalOptions } /** * 创建table sink实例,在BeanDynamicTableSink中定义接收到的RowData如何sink */ override def createDynamicTableSink(context: DynamicTableFactory.Context): DynamicTableSink = { val helper = FactoryUtil.createTableFactoryHelper(this, context) val physicalSchema = TableSchemaUtils.getPhysicalSchema(context.getCatalogTable.getSchema) val config = helper.getOptions helper.validate() val dataType = context.getCatalogTable.getSchema.toPhysicalRowDataType new BeanDynamicTableSink(physicalSchema, config, dataType) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/bean/BeanDynamicTableSink.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.connector.bean import com.zto.fire.predef._ import org.apache.flink.configuration.ReadableConfig import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction} import org.apache.flink.table.api.TableSchema import org.apache.flink.table.connector.ChangelogMode import org.apache.flink.table.connector.sink.{DynamicTableSink, SinkFunctionProvider} import org.apache.flink.table.data.RowData import org.apache.flink.table.types.DataType /** * sql connector的sink * @author ChengLong 2021-5-7 15:48:03 */ class BeanDynamicTableSink(tableSchema: TableSchema, options: ReadableConfig, dataType: DataType) extends DynamicTableSink { override def getChangelogMode(requestedMode: ChangelogMode): ChangelogMode = ChangelogMode.insertOnly() override def copy(): DynamicTableSink = new BeanDynamicTableSink(tableSchema, options, dataType) override def asSummaryString(): JString = "bean-sink" /** * 核心逻辑,定义如何将数据sink */ override def getSinkRuntimeProvider(context: DynamicTableSink.Context): DynamicTableSink.SinkRuntimeProvider = { SinkFunctionProvider.of(new RichSinkFunction[RowData] { override def invoke(value: RowData, context: SinkFunction.Context): Unit = { println("sink---> " + value.toString) } }) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/bean/BeanDynamicTableSource.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.connector.bean import com.zto.fire.common.util.DateFormatUtils import com.zto.fire.examples.bean.People import com.zto.fire.flink.util.FlinkUtils import com.zto.fire.predef._ import org.apache.flink.configuration.ReadableConfig import org.apache.flink.streaming.api.functions.source.{RichSourceFunction, SourceFunction} import org.apache.flink.table.api.TableSchema import org.apache.flink.table.connector.ChangelogMode import org.apache.flink.table.connector.source.{DynamicTableSource, ScanTableSource, SourceFunctionProvider} import org.apache.flink.table.data.RowData import org.apache.flink.table.types.DataType import org.apache.flink.table.types.logical.RowType /** * 定义source table * * @author ChengLong 2021-5-7 15:48:03 */ class BeanDynamicTableSource(tableSchema: TableSchema, options: ReadableConfig, producedDataType: DataType) extends ScanTableSource { override def getChangelogMode: ChangelogMode = ChangelogMode.insertOnly() override def copy(): DynamicTableSource = new BeanDynamicTableSource(tableSchema, options, producedDataType) override def asSummaryString(): String = "bean" /** * 核心逻辑,定义如何产生source表的数据 */ override def getScanRuntimeProvider(scanContext: ScanTableSource.ScanContext): ScanTableSource.ScanRuntimeProvider = { // source table的schema val rowType = this.tableSchema.toRowDataType.getLogicalType.asInstanceOf[RowType] // 将自定义的source function传入 SourceFunctionProvider.of(new BeanSourceFunction(rowType, options), false) } } /** * 自定义的sink function,用于通知flink sql,如何将RowData数据收集起来 */ class BeanSourceFunction(rowType: RowType, options: ReadableConfig) extends RichSourceFunction[RowData] { override def run(ctx: SourceFunction.SourceContext[RowData]): Unit = { // 指定每次sink多久以后进行下一次的sink val duration = options.get(BeanOptions.DURATION) // 获取配置的重复次数,指定重发几次 val times = options.get(BeanOptions.repeatTimes) for (i <- 1 to times) { People.createList().foreach(people => { // 通过ctx收集sink的数据 ctx.collect(FlinkUtils.bean2RowData(people, rowType)) }) println(s"================${DateFormatUtils.formatCurrentDateTime()}==================") Thread.sleep(duration) } } override def cancel(): Unit = {} } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/bean/BeanOptions.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.connector.bean import com.zto.fire.{JInt, JLong} import org.apache.flink.configuration.{ConfigOption, ConfigOptions} /** * 自定义sql connector支持的选项 * * @author ChengLong 2021-5-7 15:48:03 */ object BeanOptions { val TABLE_NAME: ConfigOption[String] = ConfigOptions .key("table-name") .stringType .noDefaultValue .withDescription("The name of impala table to connect.") val DURATION: ConfigOption[JLong] = ConfigOptions .key("duration") .longType() .defaultValue(3000L) .withDescription("The duration of data send.") val repeatTimes: ConfigOption[JInt] = ConfigOptions .key("repeat-times") .intType() .defaultValue(5) .withDescription("The repeat times.") } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/clickhouse/ClickhouseTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.connector.clickhouse import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.JSONUtils import com.zto.fire.core.anno.connector.Kafka import com.zto.fire.examples.bean.Student import org.apache.flink.api.scala._ import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Checkpoint /** * flink clickhouse connector * * @contact Fire框架技术交流群(钉钉):35373471 */ @Checkpoint(60) @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire", autoCommit = true) // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object ClickhouseTest extends FlinkStreaming { /** * 业务逻辑代码,会被fire自动调用 */ override def process: Unit = { // sql(DDL.createStudent("t_kafka", 10)) val dstream = this.fire.createKafkaDirectStream().filter(JSONUtils.isJson(_)).map(JSONUtils.parseObject[Student](_)) dstream.createOrReplaceTempView("t_kafka") sql( """ |CREATE TABLE t_user ( | `id` BIGINT, | `name` STRING, | `age` INT, | `sex` STRING, | `score` DECIMAL, | `birthday` TIMESTAMP |) WITH ( | 'connector' = 'clickhouse', | 'url' = 'jdbc:clickhouse://node01:8123,node02:8123,node03:8123', | 'database-name' = 'study', | 'username' = 'root', | 'password' = 'fire', | 'use-local' = 'true', -- 指定为true,当分布式表写入时写的是本地表 | 'table-name' = 't_student', | 'sink.batch-size' = '10', | 'sink.flush-interval' = '3', | 'sink.max-retries' = '3' |) |""".stripMargin) sql( """ |insert into t_user |select | id, name, age, | case when sex then '男' else '女' end, | cast(length as DECIMAL), | cast(createTime as TIMESTAMP) |from t_kafka |""".stripMargin) sql( """ |select | id, name, age, | case when sex then '男' else '女' end, | cast(length as DECIMAL), | cast(createTime as TIMESTAMP) |from t_kafka |""".stripMargin).print() } } ================================================ FILE: 
fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/hive/HiveBatchSinkTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.connector.hive import com.zto.fire._ import com.zto.fire.core.anno.connector.Hive import com.zto.fire.flink.FlinkStreaming /** * 基于fire框架进行Flink SQL开发
* 1. Flink SQL开发官方文档——kafka connector
* 2. Flink SQL开发官方文档——jdbc connector * * @author ChengLong * @since 2.0.0 * @create 2021-01-18 17:24 * @contact Fire框架技术交流群(钉钉):35373471 */ @Hive("test") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object HiveBatchSinkTest extends FlinkStreaming { // 具体的业务逻辑放到process方法中 override def process: Unit = { this.fire.useHiveCatalog() sql("drop table if exists tmp.flink_hive_sink4") sql( """ |CREATE TABLE if not exists tmp.flink_hive_sink4 ( | bill_num BIGINT, | disorsen_man_code STRING | ) PARTITIONED BY (ds STRING) STORED AS textfile |""".stripMargin) sql( """ |insert overwrite table tmp.flink_hive_sink4 select bill_num,disorsen_man_code,ds from dw.zto_rn_bill_statis limit 10 |""".stripMargin) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/hive/HiveSinkTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.connector.hive import com.zto.fire._ import com.zto.fire.flink.FlinkStreaming /** * 基于fire框架进行Flink SQL开发
* 1. Flink SQL开发官方文档——kafka connector
* 2. Flink SQL开发官方文档——jdbc connector * * @author ChengLong * @since 2.0.0 * @create 2021-01-18 17:24 * @contact Fire框架技术交流群(钉钉):35373471 */ object HiveSinkTest extends FlinkStreaming { // 具体的业务逻辑放到process方法中 override def process: Unit = { this.fire.disableOperatorChaining() sql( """ |CREATE TABLE t_student ( | `table` STRING, | `before` ROW(`id` bigint, `age` int, `name` string, `length` double, `createTime` string), -- 嵌套json的声明方式,使用ROW(),这么写很麻烦,但没办法 | `after` ROW(id bigint, age int, name string, length double, createTime string) |) WITH ( | 'connector' = 'kafka', -- 用于指定connector的类型 | 'topic' = 'fire', -- 消费的topic名称为fire | 'properties.bootstrap.servers' = 'kafka-server:9092', -- kafka的broker地址 | 'properties.group.id' = 'fire', -- 当前flink sql任务所使用的groupId | 'scan.startup.mode' = 'earliest-offset', -- 指定从什么位置开始消费 | 'format' = 'json' -- 指定解析的kafka消息为json格式 |) |""".stripMargin) sql( """ |create view v_student as |select | t.`table` as table_name, | after.id as id, -- 解析ROW类型声明的嵌套字段,直接以点的方式一级一级指定 | after.age as age, | after.name as name, | after.length as length, | after.createTime as create_time |from t_student t |""".stripMargin) this.fire.useHiveCatalog() println(this.tableEnv.getCurrentCatalog) sql( """ |CREATE TABLE if not exists tmp.flink_hive_sink ( | id BIGINT, | name STRING, | age INT |) PARTITIONED BY (ds STRING) STORED AS textfile TBLPROPERTIES ( | 'sink.partition-commit.trigger'='partition-time', -- 分区触发提交 | 'sink.partition-commit.delay'='0 s', -- 提交延迟 | 'sink.partition-commit.policy.kind'='metastore,success-file' -- 提交类型 |) |""".stripMargin) sql( """ |INSERT INTO TABLE hive.tmp.flink_hive_sink SELECT id, name, age, DATE_FORMAT(create_time, 'yyyyMMdd') FROM default_catalog.default_database.v_student |""".stripMargin) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/kafka/KafkaConsumer.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.connector.kafka import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.flink.FlinkStreaming /** * * @contact Fire框架技术交流群(钉钉):35373471 */ @Config( """ |flink.sql.conf.table.exec.state.ttl = 1 ms |""") object KafkaConsumer extends FlinkStreaming { override def process: Unit = { // this.insertPrint this.streamJoin } def streamJoin: Unit = { val table = this.flink.sql( """ |CREATE TABLE kafka ( | id int, | name string, | age int, | length string, | before row, | code as before.bill_code, | bage as before.bage, | sex boolean |) WITH ( | 'connector' = 'kafka', | 'topic' = 'fire', | 'properties.bootstrap.servers' = 'kafka-server:9092', | 'properties.group.id' = 'fire', | 'scan.startup.mode' = 'latest-offset', | 'value.format' = 'json' |) |""".stripMargin) this.flink.sql( """ |CREATE TABLE kafka2 ( | id int, | name string, | age int, | length string, | before row, | code as before.bill_code, | bage as before.bage, | sex boolean |) WITH ( | 'connector' = 'kafka', | 'topic' = 'fire2', | 'properties.bootstrap.servers' = 'kafka-server:9092', | 'properties.group.id' = 'fire2', | 'scan.startup.mode' = 'latest-offset', | 'value.format' = 'json' |) |""".stripMargin) sql( """ |create view kafka_join |as |select | k1.id, | k2.name, | k2.before.bill_code as bill_code, | k1.bage, | k2.bage |from kafka k1 left join kafka2 k2 | on k1.before.bill_code=k2.code |where k1.bage > 10 |""".stripMargin) sql( """ |select * from kafka_join |""".stripMargin).print() } def insertPrint: Unit = { this.flink.sql( """ |CREATE TABLE kafka ( | id int, | name string, | age int, | length string, | before row, | -- code as before.bill_code, | -- bage as before.bage, | sex boolean |) WITH ( | 'connector' = 'kafka', | 'topic' = 'fire', | 'properties.bootstrap.servers' = 'kafka-server:9092', | 'properties.group.id' = 'fire', | 'scan.startup.mode' = 'latest-offset', | 'value.format' = 'json' |) |""".stripMargin) sql( """ |create table `print` with('connector' = 'print') like kafka (EXCLUDING ALL) |""".stripMargin) sql( """ |insert into print select * from kafka |""".stripMargin) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/rocketmq/RocketMQConnectorTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.connector.rocketmq import com.zto.fire.core.anno.lifecycle.{Step1, Step2, Step3} import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Streaming /** * RocketMQ connector * * @author ChengLong * @since 1.0.0 * @create 2021-01-18 17:24 * @contact Fire框架技术交流群(钉钉):35373471 */ @Streaming(parallelism = 2, interval = 30) object RocketMQConnectorTest extends FlinkStreaming { @Step1("定义RocketMQ源表") def source: Unit = { sql(""" |CREATE table source ( | id int, | name string, | age int, | length double, | data DECIMAL(10, 5) |) with ( | 'connector'='fire-rocketmq', | 'format'='json', | 'rocket.brokers.name'='bigdata_test', | 'rocket.topics'='fire', | 'rocket.group.id'='fire', | 'rocket.consumer.tag'='*' |) |""".stripMargin) } @Step2("定义目标表") def sink: Unit = { sql( """ |CREATE table sink ( | id int, | name string, | age int, | length double, | data DECIMAL(10, 5) |) with ( | 'connector'='fire-rocketmq', | 'format'='json', | 'rocket.brokers.name'='bigdata_test', | 'rocket.topics'='fire2', | 'rocket.consumer.tag'='*', | 'rocket.sink.parallelism'='1' |) |""".stripMargin) } @Step3("数据sink") def insert: Unit = { sql(""" |insert into sink select * from source |""".stripMargin) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/rocketmq/RocketTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.connector.rocketmq import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.core.anno.connector.RocketMQ import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Checkpoint import org.apache.flink.api.scala._ /** * Flink流式计算任务消费rocketmq * * @author ChengLong * @since 2.0.0 * @create 2021-5-13 14:26:24 * @contact Fire框架技术交流群(钉钉):35373471 */ @Checkpoint(60) @Config("default.parallelism=2") @RocketMQ(brokers = "bigdata_test", topics = "fire", groupId = "fire", tag = "*", startingOffset = "latest") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object RocketTest extends FlinkStreaming { override def process: Unit = { // 1. createRocketMqPullStreamWithTag()返回的是三元组,分别是:(tag, key, value) this.fire.createRocketMqPullStreamWithTag().setParallelism(1).map(t => { this.logInfo("消息:" + t._3) t._3 }).print() // 2. createRocketMqPullStreamWithKey()返回的是二元组,分别是:(key, value) // this.fire.createRocketMqPullStreamWithKey().map(t => t._2).print() // 3. 
createRocketMqPullStream()返回的是消息体 // this.fire.createRocketMqPullStream() // 从另一个rocketmq中消费数据 } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/sql/DDL.scala ================================================ package com.zto.fire.examples.flink.connector.sql object DDL { /** * 创建自动生成数据的source table * * @param tableName 表名 * @param rowsPerSec 每秒产生的记录数 * * @return ddl */ def createStudent(tableName: String = "t_student", rowsPerSec: Int = 5000): String = { s""" | CREATE TABLE ${tableName} ( | id BIGINT, | name STRING, | age INT, | createTime TIMESTAMP(13), | length DECIMAL(5, 2), | sex Boolean |) WITH ( | 'connector' = 'datagen', | | 'rows-per-second'='${rowsPerSec}', -- 5000/s | | 'fields.id.min'='1', -- id字段,1到1000之间 | 'fields.id.max'='1000', | | 'fields.name.length'='5', -- name字段,长度为5 | | 'fields.age.min'='1', -- age字段,1到120岁 | 'fields.age.max'='120', | | 'fields.length.min'='50', -- length字段,最小1000,最大10000 | 'fields.length.max'='220' |) |""".stripMargin } /** * 创建print connector * * @param printTableName sink print 表名 * @param likeTableName like的source 表名 * @return 建表语句 */ def createPrintLike(printTableName: String = "t_print_table", likeTableName: String): String = { s""" |CREATE TABLE ${printTableName} WITH ('connector' = 'print') |LIKE ${likeTableName} (EXCLUDING ALL) |""".stripMargin } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/connector/sql/DataGenTest.scala ================================================ package com.zto.fire.examples.flink.connector.sql import com.zto.fire.core.anno.lifecycle.{Step1, Step2, Step3} import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Streaming /** * DataGen connector使用 * * @author ChengLong * @contact Fire框架技术交流群(钉钉):35373471 */ @Streaming(interval = 10) object DataGenTest extends FlinkStreaming { private lazy val dataGenTable = "t_student" private lazy val sinkPrintTable = "t_print_table" @Step1("定义源表表结构") def sourceTable: Unit = { sql( s""" | CREATE TABLE ${this.dataGenTable} ( | id BIGINT, | name STRING, | age INT, | createTime TIMESTAMP(13), | sex Boolean |) WITH ( | 'connector' = 'datagen', | 'rows-per-second'='100', -- 5000/s | 'fields.id.min'='1', -- id字段,1到1000之间 | 'fields.id.max'='1000', | 'fields.name.length'='5', -- name字段,长度为5 | 'fields.age.min'='1', -- age字段,1到120岁 | 'fields.age.max'='120' |) |""".stripMargin) } @Step2("定义目标表结构") def destTable: Unit = { sql( s""" |CREATE TABLE ${this.sinkPrintTable} WITH ('connector' = 'print') |LIKE ${this.dataGenTable} (EXCLUDING ALL) |""".stripMargin) } @Step3("执行insert语句") def insertStatement: Unit = { sql( s""" |insert into ${this.sinkPrintTable} |select | id, name, age, createTime, sex |from ${this.dataGenTable} |group by id, name, age, createTime, sex |""".stripMargin) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/lineage/FlinkSqlLineageTest.scala ================================================ package com.zto.fire.examples.flink.lineage import com.zto.fire._ import com.zto.fire.common.util.{JSONUtils, ThreadUtils} import com.zto.fire.core.anno.connector.Hive import com.zto.fire.core.anno.lifecycle.{Step1, Step2, Step3, Step4} import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Streaming import com.zto.fire.flink.sync.FlinkLineageAccumulatorManager import java.util.concurrent.TimeUnit /** * 
用于解析flink sql血缘依赖 * * @author ChengLong 2022-09-13 14:20:13 * @since 2.0.0 */ @Hive("test") @Streaming(interval = 60, parallelism = 2) object FlinkSqlLineageTest extends FlinkStreaming { @Step1("血缘信息输出") def lineage: Unit = { // 定义hive表前先切换到hive catalog ThreadUtils.scheduleAtFixedRate({ println(s"累加器值:" + JSONUtils.toJSONString(FlinkLineageAccumulatorManager.getValue) + "\n\n") }, 0, 10, TimeUnit.SECONDS) } @Step2("定义RocketMQ源表") def source: Unit = { sql(""" |CREATE table source ( | id int, | name string, | age int, | length double, | data DECIMAL(10, 5) |) with ( | 'connector'='fire-rocketmq', | 'format'='json', | 'rocket.brokers.name'='bigdata_test', | 'rocket.topics'='fire', | 'rocket.group.id'='fire', | 'rocket.consumer.tag'='*' |) |""".stripMargin) } @Step3("定义目标表") def sink: Unit = { sql( """ |CREATE table sink ( | id int, | name string, | age int, | length double, | data DECIMAL(10, 5) |) with ( | 'connector'='fire-rocketmq', | 'format'='json', | 'rocket.brokers.name'='bigdata_test', | 'rocket.topics'='fire2', | 'rocket.consumer.tag'='*', | 'rocket.sink.parallelism'='1' |) |""".stripMargin) } @Step4("数据sink") def insert: Unit = { sql(""" |insert into sink select * from source |""".stripMargin) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/lineage/LineageTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.lineage import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.enu.Datasource import com.zto.fire.common.util.{DatasourceDesc, DateFormatUtils, JSONUtils, ThreadUtils} import com.zto.fire.core.anno.connector._ import com.zto.fire.core.anno.lifecycle.{Process, Step1} import com.zto.fire.examples.bean.Student import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Streaming import com.zto.fire.flink.sync.FlinkLineageAccumulatorManager import com.zto.fire.hbase.HBaseConnector import com.zto.fire.predef.{JConcurrentHashMap, JHashSet} import org.apache.flink.api.scala._ import java.util.concurrent.TimeUnit @HBase("test") @Config("""fire.lineage.run.initialDelay=10""") @Streaming(interval = 60, unaligned = true, parallelism = 2) // 100s做一次checkpoint,开启非对齐checkpoint @RocketMQ(brokers = "bigdata_test", topics = "fire", groupId = "fire") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") @Jdbc(url = "jdbc:mysql://mysql-server:3306/fire", username = "root", password = "root") object LineageTest extends FlinkStreaming { private val hbaseTable = "fire_test_1" private lazy val tableName = "spark_test" @Process def kafkaSource: Unit = { this.fire.createKafkaDirectStream().print() val dstream = this.fire.createRocketMqPullStream() dstream.map(t => { val timestamp = DateFormatUtils.formatCurrentDateTime() val insertSql = s"INSERT INTO $tableName (name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" this.fire.jdbcUpdate(insertSql, Seq("admin", 12, timestamp, 10.0, 1)) HBaseConnector.get[Student](hbaseTable, classOf[Student], Seq("1")) t }).print() sql(""" |CREATE table source ( | id int, | name string, | age int, | length double, | data DECIMAL(10, 5) |) with ( | 'connector'='fire-rocketmq', | 'format'='json', | 'rocket.brokers.name'='bigdata_test', | 'rocket.topics'='fire', | 'rocket.group.id'='fire', | 'rocket.consumer.tag'='*' |); | |CREATE table sink ( | id int, | name string, | age int, | length double, | data DECIMAL(10, 5) |) with ( | 'connector'='fire-rocketmq', | 'format'='json', | 'rocket.brokers.name'='bigdata_test', | 'rocket.topics'='fire2', | 'rocket.consumer.tag'='*', | 'rocket.sink.parallelism'='1' |); | |insert into sink select * from source; |""".stripMargin) } @Step1("获取血缘信息") def lineage: Unit = { ThreadUtils.scheduleAtFixedRate({ println(s"累加器值:" + JSONUtils.toJSONString(FlinkLineageAccumulatorManager.getValue)) }, 0, 60, TimeUnit.SECONDS) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/module/ArthasTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.module import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.JSONUtils import com.zto.fire.core.anno.connector.Kafka import com.zto.fire.examples.bean.Student import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Checkpoint import org.apache.flink.api.scala._ import org.apache.flink.streaming.api.functions.KeyedProcessFunction import org.apache.flink.streaming.api.scala.KeyedStream import org.apache.flink.util.Collector /** * Flink Streaming与Arthas集成测试 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Config( """ |# 直接从配置文件中拷贝过来即可 | #注释信息 |fire.acc.timer.max.size=30 |fire.acc.log.max.size=20 |fire.analysis.arthas.enable=false |fire.log.level.conf.org.apache.flink=warn |fire.analysis.arthas.container.enable=false |fire.rest.filter.enable=true |""") @Checkpoint(interval = 10, concurrent = 1, pauseBetween = 60, timeout = 60) @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire", autoCommit = true) // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object ArthasTest extends FlinkStreaming { /** * 业务逻辑代码,会被fire自动调用 */ override def process: Unit = { val dstream = this.fire.createKafkaDirectStream().filter(json => JSONUtils.isJson(json)).map(json => JSONUtils.parseObject[Student](json)).setParallelism(2) val value: KeyedStream[Student, JLong] = dstream.keyBy(t => t.getId) this.printConf value.process(new KeyedProcessFunction[JLong, Student, String]() { override def processElement(value: Student, ctx: KeyedProcessFunction[_root_.com.zto.fire.JLong, Student, String]#Context, out: Collector[String]): Unit = { printConf val state = this.getState[Long]("sum") state.update(state.value() + 1) println(s"当前key=${value.getId} sum=${state.value()}") out.collect(value.getName) } }).print("name") } def printConf: Unit = { println("================================") println("fire.thread.pool.size=" + this.conf.getInt("fire.thread.pool.size", -1)) println("fire.thread.pool.schedule.size=" + this.conf.getInt("fire.thread.pool.schedule.size", -1)) println("================================") } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/module/ExceptionTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.module import com.zto.fire._ import com.zto.fire.common.util.{JSONUtils, ShutdownHookManager} import com.zto.fire.core.anno.connector._ import com.zto.fire.examples.bean.Student import com.zto.fire.examples.flink.sql.RocketMQConnectorTest.sql import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Streaming import org.apache.flink.api.scala._ @Streaming(interval = 10, unaligned = true, parallelism = 2) // 100s做一次checkpoint,开启非对齐checkpoint @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") object ExceptionTest extends FlinkStreaming { override def process: Unit = { this.testSqlException // this.testApiException } /** * 测试SQL异常捕获 */ def testSqlException: Unit = { sql(""" |CREATE table source ( | id int, | name string, | age int, | length double, | data DECIMAL(10, 5) |) with ( | 'connector'='fire-rocketmq', | 'format'='json', | 'rocket.brokers.name'='bigdata_test', | 'rocket.topics'='fire', | 'rocket.group.id'='fire', | 'rocket.consumer.tag'='*' |); | |CREATE table sink ( | id int, | name string, | age int, | length double, | data DECIMAL(10, 5) |) with ( | 'connector'='fire-rocketmq', | 'format'='json', | 'rocket.brokers.name'='bigdata_test', | 'rocket.topics'='fire2', | 'rocket.consumer.tag'='*', | 'rocket.sink.parallelism'='1' |); | |insert into select * from source; |""".stripMargin) } /** * 测试API的异常捕获 */ def testApiException: Unit = { val dstream = this.fire.createKafkaDirectStream() dstream.map(t => { val student = JSONUtils.parseObject[Student](t) if (student.getId % 2 != 0) { val a = 1 / 0 } t }).print() } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/sql/HiveDimDemo.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.sql import com.zto.fire._ import com.zto.fire.core.anno.connector.Hive import com.zto.fire.core.anno.lifecycle._ import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Streaming /** * 用于演示通过flink sql读取hive维表表 * * @author ChengLong 2022-08-23 14:21:55 * @since 2.0.0 */ @Hive("test") @Streaming(interval = 60, parallelism = 2) object HiveDimDemo extends FlinkStreaming { @Step1("创建hive表数据源") def hiveTable: Unit = { // 定义hive表前先切换到hive catalog this.fire.useHiveCatalog() sql( """ |CREATE TABLE if not exists `t_hive_table` ( | `id` BIGINT, | `name` STRING, | `age` INT, | `createTime` TIMESTAMP, | `length` double |) PARTITIONED BY (ds STRING) STORED AS orc TBLPROPERTIES ( | 'partition.time-extractor.timestamp-pattern'='$ds', | 'sink.partition-commit.trigger'='process-time', | 'sink.partition-commit.delay'='1 min', | 'sink.partition-commit.policy.kind'='metastore,success-file', | 'lookup.join.cache.ttl' = '60 s' |) |""".stripMargin) } @Step2("创建kafka数据源") def kafkaTable: Unit = { this.fire.useDefaultCatalog sql( """ |-- 1. 定义kafka connector |CREATE TABLE t_kafka_fire ( | `id` BIGINT, | `name` STRING, | `age` INT, | `createTime` TIMESTAMP(3), | `length` double, | proctime as proctime() |) WITH ( | 'datasource' = 'kafka_test', -- 数据源别名定义在common.properties中,也可通过@Config注解定义 | 'scan.startup.mode' = 'earliest-offset', | 'format' = 'json' |); |""".stripMargin) } @Step3("kafka数据与hive维表关联") def sinkToHive: Unit = { sql( """ |select | t1.id, t2.name |from t_kafka_fire t1 | left join hive.tmp.t_hive_table for system_time as of t1.proctime as t2 on t1.id=t2.id |group by t1.id, t2.name |""".stripMargin).print() } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/sql/HiveWriteDemo.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.sql import com.zto.fire._ import com.zto.fire.core.anno.connector.Hive import com.zto.fire.core.anno.lifecycle._ import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Streaming /** * 用于演示通过flink sql写hive表 * * @author ChengLong 2022-08-23 13:03:20 * @since 2.0.0 * @contact Fire框架技术交流群(钉钉):35373471 */ @Hive("test") @Streaming(interval = 60, parallelism = 2) object HiveWriteDemo extends FlinkStreaming { @Step1("创建hive表数据源") def hiveTable: Unit = { // 定义hive表前先切换到hive catalog this.fire.useHiveCatalog() sql( """ |CREATE TABLE if not exists `t_hive_table` ( | `id` BIGINT, | `name` STRING, | `age` INT, | `createTime` TIMESTAMP, | `length` double |) PARTITIONED BY (ds STRING) STORED AS orc TBLPROPERTIES ( | 'partition.time-extractor.timestamp-pattern'='$ds', | 'sink.partition-commit.trigger'='process-time', | 'sink.partition-commit.delay'='1 min', | 'sink.partition-commit.policy.kind'='metastore,success-file' |) |""".stripMargin) } @Step2("创建kafka数据源") def kafkaTable: Unit = { this.fire.useDefaultCatalog sql( """ |-- 1. 定义kafka connector |CREATE TABLE t_kafka_fire ( | `id` BIGINT, | `name` STRING, | `age` INT, | `createTime` TIMESTAMP(3), | `length` double |) WITH ( | 'datasource' = 'kafka_test', -- 数据源别名定义在common.properties中,也可通过@Config注解定义 | 'scan.startup.mode' = 'earliest-offset', | 'format' = 'json' |); |""".stripMargin) } @Step3("将kafka数据写入到hive表中") def sinkToHive: Unit = { sql( """ |insert into hive.tmp.t_hive_table |select | `id`, | `name`, | `age`, | `createTime`, | `length`, | DATE_FORMAT(LOCALTIMESTAMP,'yyyyMMdd') as ds |from `t_kafka_fire` d |""".stripMargin) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/sql/JdbcDimDemo.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.sql import com.zto.fire._ import com.zto.fire.core.anno.lifecycle._ import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Streaming /** * 用于演示通过flink sql读取mysql维表表 * * @author ChengLong 2022-08-23 14:21:55 * @since 2.0.0 */ @Streaming(interval = 60, parallelism = 2) object JdbcDimDemo extends FlinkStreaming { @Step1("创建mysql维表数据源") def dimTable: Unit = { sql( """ |CREATE TABLE t_mysql_dim ( | `id` BIGINT, | `name` STRING, | `ds` STRING, | `count_value` BIGINT, | PRIMARY KEY (id) NOT ENFORCED |) WITH ( | 'datasource' = 'jdbc_test', -- 数据源别名定义在common.properties中,也可通过@Config注解定义 | 'table-name' = 't_flink_agg', | 'lookup.cache.max-rows'='1000', | 'lookup.cache.ttl' = '1h', | 'lookup.max-retries' = '3' |); |""".stripMargin) } @Step2("创建kafka数据源") def kafkaTable: Unit = { this.fire.useDefaultCatalog sql( """ |-- 1. 
定义kafka connector |CREATE TABLE t_kafka_fire ( | `id` BIGINT, | `name` STRING, | `age` INT, | `createTime` TIMESTAMP(3), | `length` double, | proctime as proctime() |) WITH ( | 'datasource' = 'kafka_test', -- 数据源别名定义在common.properties中,也可通过@Config注解定义 | 'scan.startup.mode' = 'earliest-offset', | 'format' = 'json' |) |""".stripMargin) } @Step3("kafka数据与mysql维表关联") def showJoin: Unit = { sql( """ |select | t1.id, t2.name |from t_kafka_fire t1 | left join t_mysql_dim for system_time as of t1.proctime as t2 on t1.id=t2.id |group by t1.id, t2.name |""".stripMargin).print() } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/sql/RocketMQConnectorTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.sql import com.zto.fire.core.anno.lifecycle.{Step1, Step2, Step3} import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Streaming /** * RocketMQ connector * * @author ChengLong * @since 1.0.0 * @create 2021-01-18 17:24 * @contact Fire框架技术交流群(钉钉):35373471 */ @Streaming(parallelism = 2, interval = 30) object RocketMQConnectorTest extends FlinkStreaming { @Step1("定义RocketMQ源表") def source: Unit = { sql(""" |CREATE table source ( | id int, | name string, | age int, | length double, | data DECIMAL(10, 5) |) with ( | 'connector'='fire-rocketmq', | 'format'='json', | 'rocket.brokers.name'='bigdata_test', | 'rocket.topics'='fire', | 'rocket.group.id'='fire', | 'rocket.consumer.tag'='*' |) |""".stripMargin) } @Step2("定义目标表") def sink: Unit = { sql( """ |CREATE table sink ( | id int, | name string, | age int, | length double, | data DECIMAL(10, 5) |) with ( | 'connector'='fire-rocketmq', | 'format'='json', | 'rocket.brokers.name'='bigdata_test', | 'rocket.topics'='fire2', | 'rocket.consumer.tag'='*', | 'rocket.sink.parallelism'='1' |) |""".stripMargin) } @Step3("数据sink") def insert: Unit = { sql(""" |insert into sink select * from source |""".stripMargin) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/sql/SimpleSqlDemo.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.sql import com.zto.fire.flink.FlinkStreaming import com.zto.fire._ import com.zto.fire.flink.anno.Streaming /** * 基于Fire框架开发Flink SQL的示例代码 * * @author ChengLong 2022-08-23 16:17:36 * @since 2.3.1 * @contact Fire框架技术交流群(钉钉):35373471 */ @Streaming(interval = 60, parallelism = 2) object SimpleSqlDemo extends FlinkStreaming { override def process: Unit = { sql( """ |CREATE TABLE t_student ( | `table` STRING, | `before` ROW(`id` bigint, `age` int, `name` string, `length` double, `createTime` string), -- 嵌套json的声明方式,使用ROW(),这么写很麻烦,但没办法 | `after` ROW(id bigint, age int, name string, length double, createTime string), | order_time TIMESTAMP(3), | WATERMARK FOR order_time AS order_time - INTERVAL '50' SECOND |) WITH ( | 'connector' = 'kafka', -- 用于指定connector的类型 | 'topic' = 'fire-sql', -- 消费的topic名称为fire | 'properties.bootstrap.servers' = 'kafka-server:9092', -- kafka的broker地址 | 'properties.group.id' = 'fire', -- 当前flink sql任务所使用的groupId | 'scan.startup.mode' = 'earliest-offset', -- 指定从什么位置开始消费 | 'format' = 'json' -- 指定解析的kafka消息为json格式 |) |""".stripMargin) sql( """ |create view v_student as |select | t.`table` as table_name, | after.id as id, -- 解析ROW类型声明的嵌套字段,直接以点的方式一级一级指定 | after.age as age, | after.name as name, | after.length as length, | order_time as create_time |from t_student t |""".stripMargin) sql( """ |CREATE TABLE sink ( | id BIGINT, | name STRING, | age INT, | `count` bigint, | PRIMARY KEY (id) NOT ENFORCED -- 指定主键字段,如果insert语句后面的select代码聚合算子(group by),则必须指定主键,用于数据的更新操作 |) WITH ( | 'connector' = 'jdbc', -- 指定当前connector为jdbc类型 | 'url' = 'jdbc:mysql://mysql-server:3306/fire', -- jdbc的url | 'table-name' = 'flink_sql_test', -- 指定往哪张数据库表中写数据,表示往mysql的名为flink_sql_test的表插入或更新数据 | 'driver' = 'com.mysql.jdbc.Driver', -- jdbc的驱动类名 | 'username' = 'root', -- jdbc的用户名 | 'password' = 'fire', -- jdbc的密码 | 'sink.buffer-flush.interval' = '10s', -- 标识每隔10s钟将数据flush一次到mysql中,避免逐条insert效率低 | 'sink.buffer-flush.max-rows' = '3', -- 标识积累满3条执行一次批量insert,通用避免逐条insert,和sink.buffer-flush.interval先符合为准 | 'sink.max-retries' = '3' -- 插入失败时重试几次 |) |""".stripMargin) sql( """ |CREATE TABLE sink2 ( | id BIGINT, | name STRING, | age INT, | `count` bigint, | PRIMARY KEY (id) NOT ENFORCED -- 指定主键字段,如果insert语句后面的select代码聚合算子(group by),则必须指定主键,用于数据的更新操作 |) WITH ( | 'connector' = 'jdbc', -- 指定当前connector为jdbc类型 | 'url' = 'jdbc:mysql://mysql-server:3306/fire', -- jdbc的url | 'table-name' = 'flink_sql_test2', -- 指定往哪张数据库表中写数据,表示往mysql的名为flink_sql_test的表插入或更新数据 | 'driver' = 'com.mysql.jdbc.Driver', -- jdbc的驱动类名 | 'username' = 'root', -- jdbc的用户名 | 'password' = 'fire', -- jdbc的密码 | 'sink.buffer-flush.interval' = '10s', -- 标识每隔10s钟将数据flush一次到mysql中,避免逐条insert效率低 | 'sink.buffer-flush.max-rows' = '3', -- 标识积累满3条执行一次批量insert,通用避免逐条insert,和sink.buffer-flush.interval先符合为准 | 'sink.max-retries' = '3' -- 插入失败时重试几次 |) |""".stripMargin) sql(""" |insert into sink |select id, name, age, sum(1) as `count` |from v_student |group by id,name,age; | |insert into sink2 |select id, name, age, sum(1) as `count` |from v_student |group by id,name,age |""".stripMargin) } } 
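SimpleSqlDemo above relies on reachable Kafka and MySQL endpoints. As a self-contained illustration of the same multi-statement sql(...) pattern, the following sketch (not part of the fire examples; the object name, schema, and generator options are assumptions) swaps in Flink's built-in datagen source and print sink so it can run without any external systems:

import com.zto.fire._
import com.zto.fire.flink.FlinkStreaming
import com.zto.fire.flink.anno.Streaming

/**
 * Hypothetical local variant of SimpleSqlDemo: the fire sql(...) helper and @Streaming
 * annotation are used exactly as in the examples above, but the connectors are Flink
 * built-ins (datagen/print), so no Kafka or MySQL instance is required.
 */
@Streaming(interval = 60, parallelism = 1)
object SimpleSqlLocalSketch extends FlinkStreaming {

  override def process: Unit = {
    // datagen produces random rows at a fixed rate; min/max bound the numeric columns
    sql(
      """
        |CREATE TABLE t_student (
        |  id   BIGINT,
        |  name STRING,
        |  age  INT
        |) WITH (
        |  'connector' = 'datagen',
        |  'rows-per-second' = '5',
        |  'fields.id.min' = '1',
        |  'fields.id.max' = '100',
        |  'fields.age.min' = '18',
        |  'fields.age.max' = '60'
        |)
        |""".stripMargin)

    // the print connector writes every changelog record (+I/-U/+U) to TaskManager stdout
    sql(
      """
        |CREATE TABLE sink_print (
        |  id      BIGINT,
        |  name    STRING,
        |  age     INT,
        |  `count` BIGINT
        |) WITH (
        |  'connector' = 'print'
        |)
        |""".stripMargin)

    // same aggregate-and-upsert shape as SimpleSqlDemo, just pointed at the print sink
    sql(
      """
        |insert into sink_print
        |select id, name, age, sum(1) as `count`
        |from t_student
        |group by id, name, age
        |""".stripMargin)
  }
}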
================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/sql/SqlJoinDemo.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.sql import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.core.anno.lifecycle.Process import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Streaming /** * 本示例代码用于演示以下使用场景: * 1. 演示如何通过配置方式替换sql中with的options选项:'datasource'='alias' * 2. 演示如何一次性执行多条sql语句:以分号分割 * * @author ChengLong 2022-08-22 17:18:49 * @since 2.3.1 * @contact Fire框架技术交流群(钉钉):35373471 */ @Streaming(interval = 100, unaligned = true, parallelism = 2) object SqlJoinDemo extends FlinkStreaming { /** * 建议将sql的options定义到[resources/common.properties]中,可被所有任务所加载 * 也可以定义到实时平台中,fire框架在启动时通过接口调用获取,当然,也可以通过@Config注解定义 */ @Process def executeSql: Unit = { sql( s""" |-- 1. 定义kafka connector |CREATE TABLE t_kafka_fire ( | `id` BIGINT, | `name` STRING, | `age` INT, | `createTime` TIMESTAMP(3), | `length` double |) WITH ( | 'datasource' = 'kafka_test', -- 数据源别名定义在common.properties中,也可通过@Config注解定义 | 'scan.startup.mode' = 'earliest-offset', | 'format' = 'json' |); | |-- 2. 定义kafka connector(另一个topic,用于双流join) |CREATE TABLE t_kafka_fire2 ( | `id` BIGINT, | `name` STRING, | `age` INT, | `createTime` TIMESTAMP(3), | `length` double |) WITH ( | 'datasource' = 'kafka_test2', -- 数据源别名定义在common.properties中,也可通过@Config注解定义 | 'scan.startup.mode' = 'earliest-offset', | 'format' = 'json' |); | |-- 3. 定义mysql目标表 |CREATE TABLE t_flink_agg ( | `id` BIGINT, | `name` STRING, | `ds` STRING, | `count_value` BIGINT, | PRIMARY KEY (id) NOT ENFORCED |) WITH ( | 'datasource' = 'jdbc_test', -- 数据源别名定义在common.properties中,也可通过@Config注解定义 | 'table-name' = 't_flink_agg', | 'sink.buffer-flush.interval' = '10s', | 'sink.buffer-flush.max-rows' = '3', | 'sink.max-retries' = '3' |); | |-- 将双流join的数据写入到mysql表中 |insert into t_flink_agg(id, name, ds, count_value) |select | k1.id, | k2.name, | DATE_FORMAT(k1.createTime, 'yyyyMMdd') as ds, | count(1) |from t_kafka_fire k1 left join t_kafka_fire2 k2 on k1.id=k2.id |group by k1.id, k2.name, DATE_FORMAT(k1.createTime, 'yyyyMMdd') |""".stripMargin).show() } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/ConfigCenterTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.stream import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.JSONUtils import com.zto.fire.core.anno.connector.Kafka import com.zto.fire.examples.bean.Student import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Checkpoint import org.apache.flink.api.scala._ import org.apache.flink.streaming.api.functions.KeyedProcessFunction import org.apache.flink.streaming.api.scala.KeyedStream import org.apache.flink.util.Collector /** * 基于Fire进行Flink Streaming开发 */ @Config( """ |# 直接从配置文件中拷贝过来即可 |fire.acc.timer.max.size=30 |fire.acc.log.max.size=20 |fire.conf.test=java |""") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") @Checkpoint(interval = 10, concurrent = 1, pauseBetween = 60, timeout = 60) // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object ConfigCenterTest extends FlinkStreaming { /** * 业务逻辑代码,会被fire自动调用 */ override def process: Unit = { val dstream = this.fire.createKafkaDirectStream().filter(json => JSONUtils.isJson(json)).map(json => JSONUtils.parseObject[Student](json)).setParallelism(2) val value: KeyedStream[Student, JLong] = dstream.keyBy(t => t.getId) this.printConf value.process(new KeyedProcessFunction[JLong, Student, String]() { override def processElement(value: Student, ctx: KeyedProcessFunction[_root_.com.zto.fire.JLong, Student, String]#Context, out: Collector[String]): Unit = { printConf val state = this.getState[String]("sum") state.update(state.value() + JSONUtils.toJSONString(value)) out.collect(value.getName) } }).print("name") } /** * 配置信息打印 * * ================================ * fire.thread.pool.size=6 * fire.thread.pool.schedule.size=5 * fire.acc.timer.max.size=30 * fire.acc.log.max.size=22 * fire.jdbc.query.partitions=13 * fire.conf.test=flink * ================================ */ def printConf: Unit = { println("================================") println("fire.thread.pool.size=" + this.conf.getInt("fire.thread.pool.size", -1)) println("fire.thread.pool.schedule.size=" + this.conf.getInt("fire.thread.pool.schedule.size", -1)) println("fire.acc.timer.max.size=" + this.conf.getInt("fire.acc.timer.max.size", -1)) println("fire.acc.log.max.size=" + this.conf.getInt("fire.acc.log.max.size", -1)) println("fire.jdbc.query.partitions=" + this.conf.getInt("fire.jdbc.query.partitions", -1)) println("fire.conf.test=" + this.conf.getString("fire.conf.test")) println("================================") } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/FlinkHiveTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.stream import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.JSONUtils import com.zto.fire.core.anno.connector.{Hive, Kafka} import com.zto.fire.examples.bean.Student import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Checkpoint import org.apache.flink.api.scala._ /** * Flink 整合hive维表的例子,在流中join hive 维表数据 * 注:流关联hive维表需要以下6个步骤: * 1. 开启hint:sql.conf.table.dynamic-table-options.enabled=true * 2. 切换为hive catalog以及hive方言:this.fire.useHiveCatalog() * 3. SQL查询维表信息并附有hint: select * from hiveTable /*+ OPTIONS('streaming-source.enable' = 'true','streaming-source.monitor-interval' = '15 s','streaming-source.partition-order'='create-time')*/ * 4. 切回默认catalog以及方言:this.fire.useDefaultCatalog * 5. 将hive维表数据注册为临时表:dimTable.createOrReplaceTempView("baseorganize") * 6. SQL中流关联hive临时表:join xxx on * * @author ChengLong 2020年4月3日 09:05:53 */ @Config( """ |kafka.brokers.name = bigdata_test |kafka.topics = fire |kafka.group.id=fire |# 1. 读取hive维表必须启用该配置 |sql.conf.table.dynamic-table-options.enabled=true |""") @Hive("test") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire", autoCommit = true) @Checkpoint(interval = 60, concurrent = 1, pauseBetween = 60, timeout = 60) // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object FlinkHiveTest extends FlinkStreaming { override def process: Unit = { val dstream = this.fire.createKafkaDirectStream().filter(json => JSONUtils.isJson(json)) .map(json => JSONUtils.parseObject[Student](json)) .setParallelism(2) dstream.createOrReplaceTempView("student") // 2. 切换hive catalog以及方言,表示从hive中读取维表数据 this.fire.useHiveCatalog() val dimTable = sql( """ |select id,shortname |from tmp.baseorganize_flink |-- 3. 指定以下hit,用于指明flink定时ttl掉维表数据 |/*+ OPTIONS('streaming-source.enable' = 'true', -- 开启流式读取 Hive 数据 |'streaming-source.partition.include' = 'all', -- 1.latest 属性: 只读取最新分区数据。2.all: 读取全量分区数据 ,默认值为 all,表示读所有分区,latest 只能用在 temporal join 中,用于读取最新分区作为维表,不能直接读取最新分区数据 |'streaming-source.monitor-interval' = '1 h', -- 指定ttl的间隔时间,监听新分区生成的时间、不宜过短 、最短是1 个小时,因为目前的实现是每个 task 都会查询 metastore,高频的查可能会对metastore 产生过大的压力。需要注意的是,1.12.1 放开了这个限制,但仍建议按照实际业务不要配个太短的 interval |'streaming-source.partition-order'='create-time')*/ -- 非hive分区表,需要指定create-time,如果是分区表:partition-name 使用默认分区名称顺序加载最新分区2.create-time 使用分区文件创建时间顺序, 3. partition-time 使用分区时间顺序 |""".stripMargin) // 4. 切换回默认的catalog后再将hive维表数据注册为临时表,避免在default catalog查询不到baseorganize这张临时表 this.fire.useDefaultCatalog // 5. 将hive维表数据注册为临时表 // dimTable.createOrReplaceTempView("baseorganize") // 5. 
关联流表与hive维表,当hive维表更新后flink会自动周期性的刷新维表数据,并体现在关联的结果中 sql( s""" |select s.id,s.name,b.shortname |from student s |left join $dimTable b |on s.id=b.id |""".stripMargin).print() } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/FlinkPartitioner.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.stream import com.zto.fire._ import com.zto.fire.core.anno.connector.{Hive, Kafka} import com.zto.fire.flink.FlinkStreaming import org.apache.flink.api.common.functions.Partitioner import org.apache.flink.api.scala._ /** * flink重分区 * * @author ChengLong 2020-4-10 09:50:26 */ @Hive("test") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire", autoCommit = true) // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object FlinkPartitioner extends FlinkStreaming { override def process: Unit = { val dstream = this.fire.createCollectionStream(1 to 10) // 将当前所有输出值都输出到下游算子的第一个实例中,会导致严重的性能问题,谨慎使用 dstream.global.print() // 将当前输出中的每一条记录随机输出到下游的每一个实例中,可显著解决数据倾斜问题 dstream.shuffle.print() // 将当前输出以循环的方式输出到下游算子的每一个实例中,可显著解决数据倾斜问题,比shuffle方式分配的更均匀 dstream.rebalance.print() // 基于上下游Operator的并行度,将记录以循环的方式输出到下游Operator的每个实例。举例: 上游并行度是2,下游是4, // 则上游一个并行度以循环的方式将记录输出到下游的两个并行度上;上游另一个并行度以循环的方式将记录输出到下游另两个并行度上。 // 若上游并行度是4,下游并行度是2,则上游两个并行度将记录输出到下游一个并行度上;上游另两个并行度将记录输出到下游另一个并行度上 // 相当于小范围的rebalance操作 dstream.rescale.print() // 将上游数据全部输出到下游每一个算子的实例中,适合于大数据集Join小数据集的场景 dstream.broadcast.print() // 将记录输出到下游本地的operator实例,ForwardPartitioner分区器要求上下游算子并行度一样,上下游Operator同属一个SubTasks dstream.forward.print() // 将记录按Key的Hash值输出到下游Operator实例 // dstream.map(t => (t, t)).keyBy(KeySelector[Int, Int]()) // 自定义分区,需继承Partitioner并实现自己的partition分区算法 dstream.map(t => (t, t)).partitionCustom(new HashPartitioner, 0).print() } /** * Flink自定义分区 */ class HashPartitioner extends Partitioner[Int] { override def partition(key: Int, numPartitions: Int): Int = { if (key % 2 == 0) { 0 } else { 1 } } } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/FlinkRetractStreamTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.stream import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.JSONUtils import com.zto.fire.core.anno.connector.{Hive, Kafka} import com.zto.fire.examples.bean.Student import com.zto.fire.flink.FlinkStreaming import org.apache.flink.api.scala._ import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment import org.apache.flink.types.Row @Config( """ |flink.fire.rest.filter.enable = false |flink.default.parallelism = 8 |flink.max.parallelism = 8 |""") @Hive("test") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire", autoCommit = true) // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object FlinkRetractStreamTest extends FlinkStreaming { val tableName = "spark_test" /** * 生命周期方法:具体的用户开发的业务逻辑代码 * 注:此方法会被自动调用,不需要在main中手动调用 */ override def process: Unit = { val dstream = this.fire.createKafkaDirectStream().map(json => JSONUtils.parseObject[Student](json)).shuffle dstream.createOrReplaceTempView("student") val table = this.fire.sqlQuery("select name, age, createTime, length, sex from student group by name, age, createTime, length, sex") val fields = "name, age, createTime, length, sex" val sql = s"INSERT INTO $tableName ($fields) VALUES (?, ?, ?, ?, ?)" // 方式一、table中的列顺序和类型需与jdbc sql中的占位符顺序保持一致 table.jdbcBatchUpdate(sql, keyNum = 10) // 方式二、自定义row取数规则,该种方式较灵活,可定义取不同的列,顺序仍需与sql占位符保持一致 table.jdbcBatchUpdate2(sql, batch = 10, flushInterval = 10000, keyNum = 10)(row => Seq(row.getField(0), row.getField(1), row.getField(2), row.getField(3), row.getField(4))) // toRetractStream支持状态更新、删除操作,比例sql中含有group by 等聚合操作,后进来的记录会导致已有的聚合结果不正确 // 使用toRetractStream后会将之前的旧的聚合结果重新发送一次,并且tuple中的flag标记为false,然后再发送一条正确的结果 // 类似于structured streaming中自动维护结果表,并进行update操作 this.tableEnv.asInstanceOf[StreamTableEnvironment].toRetractStream[Row](table).print() } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/FlinkSinkHiveTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.stream import com.zto.fire._ import com.zto.fire.flink.FlinkStreaming /** * 基于fire框架进行Flink SQL开发
* 1. Flink SQL开发官方文档——kafka connector
* 2. Flink SQL开发官方文档——jdbc connector * * @author ChengLong * @since 2.0.0 * @create 2021-01-18 17:24 */ object FlinkSinkHiveTest extends FlinkStreaming { // 具体的业务逻辑放到process方法中 override def process: Unit = { // 消息 """ |{"table":"t_student", "order_time": "2021-07-26 15:58:59.181","before":{"id":1,"age":1,"name":"spark1","length":51.1,"createTime":"2021-06-20 11:31:51"},"after":{"id":1,"age":21,"name":"flink1","length":151.1,"createTime":"2021-06-22 10:31:30"}} |{"table":"t_student", "order_time": "2021-07-27 15:58:59.181","before":{"id":2,"age":2,"name":"spark2","length":52.2,"createTime":"2021-06-20 11:32:52"},"after":{"id":2,"age":22,"name":"flink2","length":152.2,"createTime":"2021-06-23 10:32:30"}} |{"table":"t_student", "order_time": "2021-07-31 15:58:59.181","before":{"id":3,"age":3,"name":"spark3","length":53.3,"createTime":"2021-06-20 11:33:53"},"after":{"id":3,"age":23,"name":"flink3","length":153.3,"createTime":"2021-06-24 10:33:30"}} |{"table":"t_student", "order_time": "2021-07-29 15:58:59.181","before":{"id":4,"age":4,"name":"spark4","length":54.4,"createTime":"2021-07-30 11:34:54"},"after":{"id":4,"age":24,"name":"flink4","length":154.4,"createTime":"2021-07-30 10:34:30"}} |{"table":"t_student", "order_time": "2021-07-30 15:58:59.181","before":{"id":5,"age":5,"name":"spark5","length":55.5,"createTime":"2021-07-29 11:35:55"},"after":{"id":5,"age":25,"name":"flink5","length":155.5,"createTime":"2021-07-29 09:35:30"}} |""".stripMargin sql( """ |CREATE TABLE t_student ( | `table` STRING, | `before` ROW(`id` bigint, `age` int, `name` string, `length` double, `createTime` string), -- 嵌套json的声明方式,使用ROW(),这么写很麻烦,但没办法 | `after` ROW(id bigint, age int, name string, length double, createTime string), | order_time TIMESTAMP(3), | WATERMARK FOR order_time AS order_time - INTERVAL '50' SECOND |) WITH ( | 'connector' = 'kafka', -- 用于指定connector的类型 | 'topic' = 'fire', -- 消费的topic名称为fire | 'scan.startup.mode'='latest-offset', | 'properties.bootstrap.servers' = 'kafka-server:9092', -- kafka的broker地址 | 'properties.group.id' = 'fire2', -- 当前flink sql任务所使用的groupId | 'format' = 'json' -- 指定解析的kafka消息为json格式 |) |""".stripMargin) sql( """ |create view v_student as |select | t.`table` as table_name, | after.id as id, -- 解析ROW类型声明的嵌套字段,直接以点的方式一级一级指定 | after.age as age, | after.name as name, | after.length as length, | order_time as order_time |from t_student t |""".stripMargin) this.tableEnv.useHiveCatalog() println(this.tableEnv.getCurrentCatalog) sql("drop table if exists tmp.flink_hive_sink") sql( """ |CREATE TABLE if not exists tmp.flink_hive_sink ( | id BIGINT, | name STRING, | age INT |) PARTITIONED BY (ds STRING) |STORED AS parquet |TBLPROPERTIES ( | 'partition.time-extractor.kind'='custom', | 'partition.time-extractor.timestamp-pattern'='$ds', -- 与分区字段对应 | 'sink.partition-commit.trigger'='partition-time', -- 分区触发提交 | 'partition.time-extractor.class'='com.zto.fire.flink.util.HivePartitionTimeExtractor', | 'sink.partition-commit.delay'='1 s', -- 提交延迟 | 'sink.partition-commit.policy.kind'='metastore,success-file' -- 提交类型 |) |""".stripMargin) sql( """ |INSERT INTO TABLE hive.tmp.flink_hive_sink SELECT id, name, age, DATE_FORMAT(order_time, 'yyyyMMdd') as ds FROM default_catalog.default_database.v_student |""".stripMargin) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/FlinkSinkTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * 
contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.stream import com.zto.fire._ import com.zto.fire.common.util.JSONUtils import com.zto.fire.core.anno.connector.Kafka import com.zto.fire.examples.bean.Student import com.zto.fire.flink.FlinkStreaming import org.apache.flink.api.scala._ import org.apache.flink.configuration.Configuration import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction} /** * 自定义sink的实现 */ @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire", autoCommit = true) // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object FlinkSinkTest extends FlinkStreaming { override def process: Unit = { val dstream = this.fire.createDirectStream().map(json => JSONUtils.parseObject[Student](json)) dstream.map(t => t.getName).addSink(new MySink).setParallelism(1) } } class MySink extends RichSinkFunction[String] { /** * open方法中可以创建数据库连接等初始化操作 * 注:若setParallelism(10)则会执行10次open方法 */ override def open(parameters: Configuration): Unit = { println("=========执行open方法========") } /** * close方法用于释放资源,如数据库连接等 */ override def close(): Unit = { println("=========执行close方法========") } override def invoke(value: String, context: SinkFunction.Context): Unit = { println("---> " + value) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/FlinkSourceTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.stream import com.zto.fire._ import com.zto.fire.core.anno.connector.Hive import com.zto.fire.flink.FlinkStreaming import org.apache.flink.api.scala._ import org.apache.flink.configuration.Configuration import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction} import org.apache.flink.streaming.api.windowing.time.Time /** * 自定义source * @author ChengLong 2020-4-7 14:30:08 */ @Hive("test") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object FlinkSourceTest extends FlinkStreaming { override def process: Unit = { val dstream = this.fire.addSource(new MySource).setParallelism(2) // 注意Time的包不要导错,来自org.apache.flink.streaming.api.windowing.time.Time dstream.timeWindowAll(Time.seconds(2)).sum(0).setParallelism(1).print } } /** * 自定义source组件 * 支持多并行度 */ class MySource extends RichParallelSourceFunction[Long] { private var isRunning = false private var index = 1 /** * open方法中可以创建数据库连接等初始化操作 * 注:若setParallelism(10)则会执行10次open方法 */ override def open(parameters: Configuration): Unit = { this.isRunning = true println("=========执行open方法========") } /** * 持续不断的将消息发送给flink * @param ctx */ override def run(ctx: SourceFunction.SourceContext[Long]): Unit = { while (this.isRunning) { this.index += 1 ctx.collect(this.index) Thread.sleep(1000) } } /** * 当任务被cancel时调用 */ override def cancel(): Unit = { this.isRunning = false println("=========执行cancel方法==========") } /** * close方法用于释放资源,如数据库连接等 */ override def close(): Unit = { println("=========执行close方法==========") } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/FlinkStateTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.stream import com.zto.fire._ import com.zto.fire.flink.FlinkStreaming import org.apache.flink.api.common.functions.{RichAggregateFunction, RichMapFunction} import org.apache.flink.api.common.state.StateTtlConfig import org.apache.flink.api.common.time.Time import org.apache.flink.api.scala._ /** * 用于演示基于FireMapFunction的状态使用 * 本例演示KeyedStream相关的状态使用,也就是说stream是通过keyBy分组过的 * 1. 由于经过keyBy算子进行了分组,因此相同key的算子都会跑到同一个subtask中执行,并行度的改变也就不会影响状态数据的一致性 * 2. 
状态在不同的task之间是隔离的,也就是说对同一个keyedStream进行多次map操作,每个map中的状态是不一样的,是隔离开来的 * * @author ChengLong 2021年1月5日09:13:50 * @since 2.0.0 */ object FlinkStateTest extends FlinkStreaming { // 将dstream声明为成员变量时,一定要加lazy关键字,避免env还没初始化导致空指针异常 lazy val dstream = this.fire.createCollectionStream(Seq((1, 1), (1, 2), (1, 3), (1, 6), (1, 9), (2, 1), (2, 2), (3, 1))).keyBy(0) /** * 一、基于FireMapFunction演示ValueState、ListState、MapState的使用 */ private def testSimpleState: Unit = { this.dstream.map(new RichMapFunction[(Int, Int), Int]() { // 定义状态的ttl时间,如果不在open方法中定义 lazy val ttlConfig = StateTtlConfig.newBuilder(Time.days(1)).build() // 如果状态放到成员变量中声明,则需加lazy关键字 lazy val listState = this.getListState[Int]("list_state") override def map(value: (Int, Int)): Int = { // FireMapFunction中提供的API,通过名称获取对应的状态实例,该API具有缓存的特性 // 因此不需要放到open或声明为成员变量,每次直接通过this.getXxxState即可获取同一实例 // 第一个参数是状态实例名称,不可重复,ttlConfig参数如果不指定,则默认不启用ttl,生产环境强烈建议开启 // 1. ValueState与KeyedStream中的每个key是一一对应的 val valueState = this.getState[Int]("value_state", ttlConfig) valueState.update(value._2 + valueState.value()) logger.warn(s"key=${value._1} 状态结果:" + valueState.value()) Thread.sleep(10000) // 2. 获取ListState,该状态的特点是KeyedStream中的每个key都单独对应一个List集合 listState.add(value._2) listState.add(value._2 + 1) // 3. 获取mapState,该状态的特点是KeyedStream中的每个key都单独对应一个Map集合 val mapState = this.getMapState[Int, Int]("map_state", ttlConfig) mapState.put(value._1, value._2) mapState.put(value._1 + 1, value._2) value._2 } }).uname(uid = "simpleState", name = "状态累加") // 通过uname进行uid与name的指定 } /** * 二、基于FireMapFunction演示AggregatingState、getReducingState的使用 */ private def testFunctionState: Unit = { this.dstream.map(new RichMapFunction[(Int, Int), Int]() { // 1. ReducingState状态演示,将Int类型数据保存到状态中 // 该ReduceFunction中定义的逻辑是将当前状态中的值与传入的新值进行累加,然后重新update到状态中 // 方法的第二个参数是reduce的具体逻辑,本示例演示的是累加 lazy val reduceState = this.getReducingState[Int]("reduce_state", (a: Int, b: Int) => a + b) // 2. AggregatingState状态使用,将Int类型数据保存到状态中 // 需要创建AggregateFunction,泛型意义依次为:输入数据类型、累加器类型,聚合结果类型 lazy val aggrState = this.getAggregatingState[(Int, Int), Int, Int]("aggr_state", this.newRichAggregateFunction) override def map(value: (Int, Int)): Int = { // 1. reduceState状态使用 this.reduceState.add(value._2) logger.warn(s"reduceState当前结果:key=${value._1} state=${this.reduceState.get()}") // 2. 
AggregatingState状态使用 this.aggrState.add(value) this.aggrState.get() logger.warn(s"aggrState当前结果:key=${value._1} state=${this.aggrState.get()}") value._2 } /** * 创建一个RichAggregateFunction的子类 * 在该子类中构建AggregateFunction对象,并定义好聚合的逻辑 * 定义将输入数据与状态中的数据进行累加 */ def newRichAggregateFunction: RichAggregateFunction[(Int, Int), Int, Int] = { new RichAggregateFunction[(Int, Int), Int, Int]() { /** 迭代状态的初始值 */ override def createAccumulator(): Int = 0 /** 每一条输入数据,和迭代数据如何迭代 */ override def add(value: (Int, Int), accumulator: Int): Int = value._2 + accumulator /** 返回数据,对最终的迭代数据如何处理,并返回结果 */ override def getResult(accumulator: Int): Int = accumulator /** 多个分区的迭代数据如何合并 */ override def merge(a: Int, b: Int): Int = a + b } } }).uname("testFunctionState") } /** * 三、演示mapWithState的使用 */ def testWithState: Unit = { // [String, Int]分表表示map后类型与状态的类型 // 每个case中返回值中Some(xxx)中的xxx就是下一次同样key的数据进来以后状态获取到的数据 // 也就是自动将Some(xxx)中的xxx数据update到ValueState中 // 本例是将上一次的状态与当前进入的值进行累加,更新到状态中 this.dstream.mapWithState[String, Int]({ // 当第一次进入,状态中没有值时,给当前value case (value: (Int, Int), None) => { logger.warn(s"状态为空:当前key=${value._1} value=${value._2}") (value._1.toString, Some(value._2)) } // 后续进入,状态中有值时,则累加当前进入的数据到状态中 case (value: (Int, Int), state: Some[Int]) => { // 从state中get到的数据是上一次同一个key的sum值,因此通过state.get的值总是滞后于sum的 val sum = value._2 + state.get logger.warn(s"当前key=${value._1} value=${value._2} state=${state.get} sum=$sum") (value._1.toString, Some(sum)) } }).uid("flatMapWithState").name("计算状态") } /** * 业务逻辑处理,该方法会被fire自动调用,可避免main方法中代码过于臃肿 */ override def process: Unit = { this.fire.setParallelism(3) // 演示ValueState、ListState、MapState的使用 this.testSimpleState // 演示AggregatingState、getReducingState的使用 // this.testFunctionState // 演示mapWithState的使用 // this.testWithState } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/HBaseTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.stream import com.zto.fire._ import com.zto.fire.common.util.JSONUtils import com.zto.fire.core.anno.connector.{HBase, HBase2, HBase3, Kafka} import com.zto.fire.examples.bean.Student import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Checkpoint import com.zto.fire.hbase.HBaseConnector import org.apache.flink.api.scala._ import org.apache.flink.streaming.api.scala.DataStream import scala.collection.mutable.ListBuffer /** * flink hbase sink * * @author ChengLong * @since 1.1.0 * @create 2020-5-25 16:32:50 */ @Checkpoint(30) @HBase("test") @HBase2("test") // 对应keyNum=2的Hbase集群地址 @HBase3("test") // 对应keyNum=3的Hbase集群地址 @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object HBaseTest extends FlinkStreaming { lazy val tableName = "fire_test_1" lazy val tableName2 = "fire_test_2" lazy val tableName3 = "fire_test_3" lazy val tableName5 = "fire_test_5" lazy val tableName6 = "fire_test_6" lazy val tableName7 = "fire_test_7" lazy val tableName8 = "fire_test_8" lazy val tableName9 = "fire_test_9" lazy val tableName10 = "fire_test_10" lazy val tableName11 = "fire_test_11" lazy val tableName12 = "fire_test_12" /** * table的hbase sink */ def testTableHBaseSink(stream: DataStream[Student]): Unit = { stream.createOrReplaceTempView("student") val table = this.flink.sqlQuery("select id, name, age from student group by id, name, age") // 方式一、自动将row转为对应的JavaBean // 注意:table对象上调用hbase api,需要指定泛型 table.hbasePutTable[Student](this.tableName).setParallelism(1) this.fire.hbasePutTable[Student](table, this.tableName2, keyNum = 2) // 方式二、用户自定义取数规则,从row中创建HBaseBaseBean的子类 table.hbasePutTable2[Student](this.tableName3)(row => new Student(1L, row.getField(1).toString, row.getField(2).toString.toInt)) // 或者 this.fire.hbasePutTable2[Student](table, this.tableName5, keyNum = 2)(row => new Student(1L, row.getField(1).toString, row.getField(2).toString.toInt)) } /** * table的hbase sink */ def testTableHBaseSink2(stream: DataStream[Student]): Unit = { val table = this.fire.sqlQuery("select id, name, age from student group by id, name, age") // 方式二、用户自定义取数规则,从row中创建HBaseBaseBean的子类 table.hbasePutTable2(this.tableName6)(row => new Student(1L, row.getField(1).toString, row.getField(2).toString.toInt)) // 或者 this.flink.hbasePutTable2(table, this.tableName7, keyNum = 2)(row => new Student(1L, row.getField(1).toString, row.getField(2).toString.toInt)) } /** * stream hbase sink */ def testStreamHBaseSink(stream: DataStream[Student]): Unit = { // 方式一、DataStream中的数据类型为HBaseBaseBean的子类 // stream.hbasePutDS(this.tableName) this.fire.hbasePutDS[Student](stream, this.tableName8) // 方式二、将value组装为HBaseBaseBean的子类,逻辑用户自定义 stream.hbasePutDS2(this.tableName9, keyNum = 2)(value => value) // 或者 this.fire.hbasePutDS2(stream, this.tableName10)(value => value) } /** * stream hbase sink */ def testStreamHBaseSink2(stream: DataStream[Student]): Unit = { // 方式二、将value组装为HBaseBaseBean的子类,逻辑用户自定义 stream.hbasePutDS2(this.tableName11)(value => value) // 或者 this.fire.hbasePutDS2(stream, this.tableName12, keyNum = 2)(value => value) } /** * hbase的基本操作 */ def testHBase: Unit = { // get操作 val getList = ListBuffer(HBaseConnector.buildGet("1")) val student = HBaseConnector.get(this.tableName, classOf[Student], getList, 1) if (student != null) println(JSONUtils.toJSONString(student)) // scan操作 val studentList = HBaseConnector.scan(this.tableName, classOf[Student], HBaseConnector.buildScan("0", "9"), 1) if 
(studentList != null) println(JSONUtils.toJSONString(studentList)) // delete操作 HBaseConnector.deleteRows(this.tableName, Seq("1")) } override def process: Unit = { val stream = this.fire.createKafkaDirectStream().filter(t => JSONUtils.isLegal(t)).map(json => JSONUtils.parseObject[Student](json)).setParallelism(1) HBaseConnector.truncateTable(this.tableName) HBaseConnector.truncateTable(this.tableName2) HBaseConnector.truncateTable(this.tableName3) HBaseConnector.truncateTable(this.tableName5) this.testTableHBaseSink(stream) this.testStreamHBaseSink(stream) this.testStreamHBaseSink2(stream) this.testTableHBaseSink2(stream) this.testHBase } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/HiveRW.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.stream import com.zto.fire._ import com.zto.fire.core.anno.connector.{Hive, Kafka} import com.zto.fire.flink.FlinkStreaming /** * 基于Fire进行Flink Streaming开发 */ @Hive("test") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire", autoCommit = true) // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object HiveRW extends FlinkStreaming { /** * 业务逻辑代码,会被fire自动调用 */ override def process: Unit = { this.fire.useHiveCatalog() this.ddl sql( """ |insert into table tmp.baseorganize_fire select * from dim.baseorganize limit 10 |""".stripMargin) sql( """ |select * from tmp.baseorganize_fire |""".stripMargin).print() } /** * 创建表 */ def ddl: Unit = { sql( """ |drop table if exists tmp.baseorganize_fire |""".stripMargin) sql( """ |create table tmp.baseorganize_fire ( | id bigint, | name string, | age int |) partitioned by (ds string) |row format delimited fields terminated by '/t' |""".stripMargin) } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/JdbcTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.stream import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.{DateFormatUtils, JSONUtils, PropUtils} import com.zto.fire.core.anno.connector.Kafka import com.zto.fire.examples.bean.Student import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.util.FlinkUtils import org.apache.flink.api.scala._ import org.apache.flink.streaming.api.scala.DataStream /** * flink jdbc sink * * @author ChengLong * @since 1.1.0 * @create 2020-05-22 11:10 */ // 1. 以代码的方式进行配置,支持不单独定义配置文件,如果同时定义了配置文件,则配置文件优先级更高 @Config( """ |######################################################################################### |# JDBC数据源配置信息详见:common.properties,公共数据源配置可放到common.properties中,便于维护 # |######################################################################################### | |# flink所支持的参数 |state.checkpoints.num-retained = 3 |state.backend.incremental = true |state.backend.rocksdb.files.open = 5000 | |hello.world = 2020 |hello.world.flag = false |hello.world.flag2 = false |""") // 2. 指定从test.properties加载配置文件 // @Config(Array("test.properties")) // 3. 指定从以下两个配置文件中加载配置信息 // @Config(Array("test.properties", "test2.properties")) @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire", autoCommit = true) // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object JdbcTest extends FlinkStreaming { lazy val tableName = "spark_test" lazy val tableName2 = "spark_test2" val fields = "name, age, createTime, length, sex".split(",") def sql(tableName: String): String = s"INSERT INTO $tableName (${fields.mkString(",")}) VALUES (?, ?, ?, ?, ?)" /** * table的jdbc sink */ def testTableJdbcSink(stream: DataStream[Student]): Unit = { stream.createOrReplaceTempView("student") val table = this.fire.sqlQuery("select name, age, createTime, length, sex from student group by name, age, createTime, length, sex") // 方式一、table中的列顺序和类型需与jdbc sql中的占位符顺序保持一致 table.jdbcBatchUpdate(sql(this.tableName)).setParallelism(1) // 或者 this.fire.jdbcBatchUpdateTable(table, sql(this.tableName2)).setParallelism(1) // 方式二、自定义row取数规则,适用于row中的列个数和顺序与sql占位符不一致的情况 table.jdbcBatchUpdate2(sql(this.tableName), flushInterval = 10000, keyNum = 2)(row => { Seq(row.getField(0), row.getField(1), row.getField(2), row.getField(3), row.getField(4)) }) // 或者 this.fire.jdbcBatchUpdateTable2(table, sql(this.tableName2), keyNum = 2)(row => { Seq(row.getField(0), row.getField(1), row.getField(2), row.getField(3), row.getField(4)) }).setParallelism(1) } /** * stream jdbc sink */ def testStreamJdbcSink(stream: DataStream[Student]): Unit = { // 方式一、指定字段列表,内部根据反射,自动获取DataStream中的数据并填充到sql中的占位符 // 此处fields有两层含义:1. sql中的字段顺序(对应表) 2. 
DataStream中的JavaBean字段数据(对应JavaBean) // 注:要保证DataStream中字段名称是JavaBean的名称,非表中字段名称 顺序要与占位符顺序一致,个数也要一致 stream.jdbcBatchUpdate(sql(this.tableName), fields, keyNum = 6).setParallelism(3) // 或者 this.fire.jdbcBatchUpdateStream(stream, sql(this.tableName2), fields, keyNum = 6).setParallelism(1) // 方式二、通过用户指定的匿名函数方式进行数据的组装,适用于上面方法无法反射获取值的情况,适用面更广 stream.jdbcBatchUpdate2(sql(this.tableName), 3, 30000, keyNum = 7) { // 在此处指定取数逻辑,定义如何将dstream中每列数据映射到sql中的占位符 value => Seq(value.getName, value.getAge, DateFormatUtils.formatCurrentDateTime(), value.getLength, value.getSex) }.setParallelism(1) // 或者 this.fire.jdbcBatchUpdateStream2(stream, sql(this.tableName2), keyNum = 7) { value => Seq(value.getName, value.getAge, DateFormatUtils.formatCurrentDateTime(), value.getLength, value.getSex) }.setParallelism(2) } def testJdbc: Unit = { // 执行查询操作 val studentList = this.fire.jdbcQueryList(s"select * from $tableName", clazz = classOf[Student]) val dataStream = this.env.fromCollection(studentList) dataStream.toTable.createOrReplaceTempView("test") this.fire.sql( """ |select * from test |""".stripMargin) dataStream.print() // 执行增删改操作 this.fire.jdbcUpdate(s"delete from $tableName") } /** * 用于测试分布式配置 */ def logConf: Unit = { println(s"isJobManager=${FlinkUtils.isJobManager} isTaskManager=${FlinkUtils.isTaskManager} hello.world=" + PropUtils.getString("hello.world", "not_found")) println(s"isJobManager=${FlinkUtils.isJobManager} isTaskManager=${FlinkUtils.isTaskManager} flink.hello=" + PropUtils.getString("flink.hello", "not_found")) println(s"isJobManager=${FlinkUtils.isJobManager} isTaskManager=${FlinkUtils.isTaskManager} flink.world=" + PropUtils.getString("flink.world", "not_found")) println(s"isJobManager=${FlinkUtils.isJobManager} isTaskManager=${FlinkUtils.isTaskManager} hello.world.flag=" + PropUtils.getBoolean("hello.world.flag", false)) println(s"isJobManager=${FlinkUtils.isJobManager} isTaskManager=${FlinkUtils.isTaskManager} hello.world.flag2=" + PropUtils.getBoolean("hello.world.flag", false, keyNum = 2)) } override def process: Unit = { this.logConf val stream = this.fire.createKafkaDirectStream().filter(t => JSONUtils.isLegal(t)).map(json => { this.logConf JSONUtils.parseObject[Student](json) }) this.testTableJdbcSink(stream) this.testStreamJdbcSink(stream) // this.testJdbc } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/UDFTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.stream import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.JSONUtils import com.zto.fire.core.anno.connector.Kafka import com.zto.fire.examples.bean.Student import com.zto.fire.flink.FlinkStreaming import org.apache.flink.api.scala._ import org.apache.flink.table.functions.ScalarFunction /** * 自定义udf测试 * * @author ChengLong 2020年1月13日 10:36:39 * @since 0.4.1 */ @Config( """ |# 开启fire udf注册功能(默认为关闭) |flink.sql.udf.fireUdf.enable=true |# 指定udf jar包的本地路径 |flink.sql.conf.pipeline.jars=file:///home/spark3/flink/udf.jar |# 指定udf函数名为appendFire,对应的udf实现类为com.zto.fire.examples.flink.stream.Udf |flink.sql.udf.conf.appendFire=com.zto.fire.examples.flink.stream.Udf |flink.sql.udf.conf.fire=com.zto.fire.examples.flink.stream.Udf |""") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire", autoCommit = true) // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object UDFTest extends FlinkStreaming { override def process: Unit = { val stream = this.fire.createKafkaDirectStream() .map(JSONUtils.parseObject[Student](_)).setParallelism(3) stream.createOrReplaceTempView("test") // 在sql中使用自定义的udf this.flink.sql("select appendFire(name), fire(age) from test").print() } } class Udf extends ScalarFunction { /** * 为指定字段的值追加fire字符串 * * @param field * 字段名称 * @return * 追加fire字符串后的字符串 */ def eval(field: String): String = field + "->fire" /** * 支持函数的重载,会自动判断输入字段的类型调用相应的函数 */ def eval(field: JInt): String = field + "-> Int fire" } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/WatermarkTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.stream import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.{DateFormatUtils, JSONUtils} import com.zto.fire.core.anno.connector.{Hive, Kafka} import com.zto.fire.examples.bean.Student import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.ext.watermark.FirePeriodicWatermarks import org.apache.commons.lang3.StringUtils import org.apache.flink.api.scala._ import org.apache.flink.streaming.api.scala.OutputTag import org.apache.flink.streaming.api.scala.function.WindowFunction import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows import org.apache.flink.streaming.api.windowing.time.Time import org.apache.flink.streaming.api.windowing.windows.TimeWindow import org.apache.flink.util.Collector import java.text.SimpleDateFormat /** * 水位线的使用要求: * 1. 开启EventTime:flink.stream.time.characteristic = EventTime * 2. 不同的task中有多个水位线实例,本地测试为了尽快看到效果,要降低并行度 * 3. 多个task中的水位线会取最早的 * 4. 
水位线触发条件:1)多个task中时间最早的水位线时间 >= window窗口end时间 2)窗口中有数据 * 5. 水位线是为了解决乱序和延迟数据的问题 * 6. 乱序数据超过水位线的三种处理方式:1. 丢弃(默认) 2. allowedLateness,相当于进一步宽容的时间 3. sideOutputLateData:将延迟数据收集起来,统一处理 * * @author ChengLong 2020-4-13 15:58:38 */ @Config( """ |flink.stream.time.characteristic = EventTime |flink.default.parallelism = 2 |""") @Hive("test") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire", autoCommit = true) // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object WatermarkTest extends FlinkStreaming { override def process: Unit = { // source端接入消息并解析 val dstream = this.fire.createKafkaDirectStream().filter(str => StringUtils.isNotBlank(str) && str.contains("}")).map(str => { val student = JSONUtils.parseObject[Student](str) (student, DateFormatUtils.formatDateTime(student.getCreateTime).getTime) }) // 分配并计算水位线,默认允许最大的乱序时间为10s,若需指定,则通过构造方法传参new FirePeriodicWatermarks(100) val watermarkDS = dstream.assignTimestampsAndWatermarks(new FirePeriodicWatermarks[(Student, Long)]() { val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS") /** * 抽取eventtime字段 */ override def extractTimestamp(element: (Student, Long), previousElementTimestamp: Long): Long = { println("---> 抽取eventtime:" + element._2 + " 最新水位线值:" + this.watermark.getTimestamp) element._2 } }).setParallelism(1) // 并行度调整为1的好处是能尽快观察到水位线的效果,否则要等多个task满足条件,不易观察结果 val windowDStream = watermarkDS .keyBy(_._1) .window(TumblingEventTimeWindows.of(Time.seconds(3))) // 最大允许延迟的数据3s,算上水位线允许最大的乱序时间10s,一共允许最大的延迟时间为13s .allowedLateness(Time.seconds(3)) // 收集延期的数据 .sideOutputLateData(this.outputTag.asInstanceOf[OutputTag[(Student, Long)]]) .apply(new WindowFunctionTest) windowDStream.print().setParallelism(1) // 获取由于延迟太久而被丢弃的数据 windowDStream.getSideOutput[(Student, Long)](this.outputTag.asInstanceOf[OutputTag[(Student, Long)]]).map(t => ("丢弃", t)).print() } /** * 泛型说明: * 1. IN: The type of the input value. * 2. OUT: The type of the output value. * 3. KEY: The type of the key. */ class WindowFunctionTest extends WindowFunction[(Student, Long), (Student, Long), Student, TimeWindow] { override def apply(key: Student, window: TimeWindow, input: Iterable[(Student, Long)], out: Collector[(Student, Long)]): Unit = { println("-->" + JSONUtils.toJSONString(key)) val sortedList = input.toList.sortBy(_._2) sortedList.foreach(t => { println("---> " + JSONUtils.toJSONString(t._1)) out.collect(t) }) } } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/stream/WindowTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.stream import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.JSONUtils import com.zto.fire.core.anno.connector.Kafka import com.zto.fire.examples.bean.Student import com.zto.fire.flink.FlinkStreaming import org.apache.flink.api.scala._ import org.apache.flink.streaming.api.TimeCharacteristic import org.apache.flink.streaming.api.scala.DataStream import org.apache.flink.streaming.api.windowing.time.Time /** * window相当于将源源不断的流按一定的规则切分成有界流,然后为每个有界流分别计算 * 当程序挂掉重启后,window中的数据不会丢失,会接着之前的window继续计算 * 注:不建议使用windowAll,该api会将数据发送到同一个分区,造成严重的性能问题 * * @author ChengLong 2020-4-18 14:34:58 */ @Config( """ |flink.fire.rest.filter.enable = false |flink.default.parallelism = 8 |flink.max.parallelism = 8 |""") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire", autoCommit = true) // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object WindowTest extends FlinkStreaming { override def process: Unit = { val dstream = this.fire.createKafkaDirectStream().map(t => JSONUtils.parseObject[Student](t)).map(s => (s.getName, s.getAge)) this.testTimeWindow(dstream) } /** * 如果是keyedStream,则窗口函数为countWindow */ private def testCountWindow(dstream: DataStream[(String, Integer)]): Unit = { dstream.keyBy(_._1) // 第一个参数表示窗口大小,窗口的容量是2条记录,达到2条会满,作为一个单独的window实例 // 第二个参数如果不指定,则表示为滚动窗口(没有重叠),如果指定则为滑动窗口(有重叠) // 以下表示每隔1条数据统计一次window数据,而这个window中包含2条记录 .countWindow(2, 1) .sum(1).print() } /** * 如果是普通的Stream,则窗口函数为countWindowAll */ def testCountWindowAll(dstream: DataStream[(String, Integer)]): Unit = { // 表示每2条计算一次,每次将计算好的两条记录结果打印 dstream.countWindowAll(2).sum(1).print() } /** * 时间窗口 */ def testTimeWindow(dstream: DataStream[(String, Integer)]): Unit = { // 窗口的宽度为1s,每隔1s钟处理过去1s的数据,这1s的时间内窗口中的记录数可多可少 dstream.timeWindowAll(Time.seconds(1)).sum(1).print() // 创建一个基于process时间(支持event时间)的滑动窗口,窗口大小为10秒,每隔5秒创建一个 dstream.keyBy(_._1).slidingTimeWindow(Time.seconds(10), Time.seconds(5), timeCharacteristic = TimeCharacteristic.ProcessingTime).sum(1).printToErr() // 创建一个滚动窗口 dstream.keyBy(_._1).tumblingTimeWindow(Time.seconds(10)).sum(1).print() // 创建一个session会话窗口,当5秒内没有消息进入,则单独划分一个窗口 dstream.keyBy(_._1).sessionTimeWindow(Time.seconds(5)).sum(1).printToErr() } } ================================================ FILE: fire-examples/flink-examples/src/main/scala/com/zto/fire/examples/flink/util/StateCleaner.scala ================================================ package com.zto.fire.examples.flink.util import com.zto.fire.flink.conf.FireFlinkConf import com.zto.fire.flink.util.StateCleanerUtils import org.apache.commons.lang3.time.DateUtils import org.apache.hadoop.fs.LocatedFileStatus import com.zto.fire._ import java.util.Date import scala.collection.mutable.ListBuffer /** * flink历史失效状态清理工具 * 清理策略: * conservativeModel:筛选出不再使用的checkpoint文件,将这些文件归档至指定的目录中,并定期删除指定时间的数据 * 直接删除模式:直接删除不再需要的checkpoint文件 * * @author ChengLong 2021-9-6 15:06:21 * @contact Fire框架技术交流群(钉钉):35373471 */ object StateCleaner extends StateCleanerUtils { // ------------------------------ hdfs 选项 ----------------------------------- // override protected val hdfs = FireFlinkConf.stateHdfsUrl override protected val hdfsUser = "hadoop" // ------------------------------ checkpoint 选项 ------------------------------ // override protected val checkpointDir = "/user/flink/checkpoint" override protected val localCheckpointBaseDir = "./home/checkpoint" override protected val archiveDir = "/user/flink/archive" // 用于存放当前线上flink任务需要使用到的状态绝对路径 override protected val inuserSet = 
new JHashSet[String]() // download到本地的metadata文件是否采用覆盖的方式避免本地磁盘存放过多的文件 override protected val overwrite = true // 是否将失效的状态文件移动到回收站,等待后续清理 override protected val conservativeModel = true // 用于存放遍历的checkpoint文件,避免二次遍历导致漏分析的文件被标记为删除 override protected val files = ListBuffer[LocatedFileStatus]() // checkpoint元数据的过期时间,AccessTime超过该时间的将会被清理 override protected val checkpointTTL = 60 // 计算出checkpointTtl对应的unix时间戳 override protected val checkpointTTLStamp = DateUtils.addDays(new Date, -this.checkpointTTL).getTime // 是否删除空文件夹 override protected val deleteEmptyDirEnabled = true // true表示使用访问时间,false表示使用修改时间 override protected val useAccessTime = false // ------------------------------ checkpoint归档选项 ---------------------------- // // 默认清理多少天之前的归档checkpoint文件 override protected val archiveTTL = 30 override protected val archiveTTLStamp = DateUtils.addDays(new Date, -this.archiveTTL).getTime // 用于指定是否删除过期的checkpoint归档文件 override protected val deleteArchiveEnabled = true // ------------------------------ savepoint 选项 ------------------------------- // override protected val savepointDir = "/user/flink/savepoint" // savepoint的ttl时间 override protected val savepointTTL = 10 override protected val savepointTTLStamp = DateUtils.addDays(new Date, -this.savepointTTL).getTime // 用于指定是否清理过期savepoint override protected val deleteSavepointEnabled = true // ------------------------------ completed-jobs 选项 --------------------------- // override protected val completedDir = "/user/flink/completed-jobs" override protected val completedTTL = 31 override protected val completedTTLStamp = DateUtils.addDays(new Date, -this.completedTTL).getTime override protected val deleteCompleteJobEnable = true def main(args: Array[String]): Unit = { this.run() } } ================================================ FILE: fire-examples/flink-examples/src/test/scala/com/zto/fire/examples/flink/anno/AnnoConfTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/ package com.zto.fire.examples.flink.anno import com.zto.fire.common.anno.{Config, TestStep} import com.zto.fire.common.conf.{FireFrameworkConf, FireHiveConf, FireKafkaConf, FireRocketMQConf} import com.zto.fire.common.util.PropUtils import com.zto.fire.core.anno._ import com.zto.fire.core.anno.connector.{HBase, HBase2, HBase3, Hive, Jdbc, Jdbc2, Jdbc3, Kafka, Kafka2, Kafka3, RocketMQ, RocketMQ2} import com.zto.fire.examples.flink.core.BaseFlinkTester import com.zto.fire.flink.FlinkStreaming import com.zto.fire.flink.anno.Checkpoint import com.zto.fire.flink.conf.FireFlinkConf import com.zto.fire.hbase.conf.FireHBaseConf import com.zto.fire.jdbc.conf.FireJdbcConf import org.junit.Test /** * 基于Fire注解进行任务参数设置 */ @Config( """ |hive.cluster=test |flink.max.parallelism=11 |""") @Checkpoint(interval = 100, unaligned = false, timeout = 10, concurrent = 2, pauseBetween = 30, failureNumber = 10) @Hive(value = "batch", catalog = "hive_catalog", version = "1.1.1", partition = "dt") @HBase(value = "batch-new1", batchSize = 10, durability = "off", scanPartitions = 12, config = Array("hbase.zookeeper.property.clientPort=2181", "zookeeper.znode.parent = /hbase")) @HBase2(value = "batch-new2", tableMetaCache = false, batchSize = 10, storageLevel = "memory_only", config = Array("hbase.zookeeper.property.clientPort=2182", "zookeeper.znode.parent = /hbase2")) @HBase3(value = "batch-new3", scanPartitions = 11, family = "data", maxRetries = 5, config = Array("hbase.zookeeper.property.clientPort=2183", "zookeeper.znode.parent = /hbase3")) @Kafka(brokers = "localhost:2181", topics = "fire", groupId = "fire", startingOffset = "start", endingOffsets = "end", autoCommit = true, sessionTimeout = 10, requestTimeout = 11, pollInterval = 12, forceOverwriteStateOffset = true, forceAutoCommit = true, forceAutoCommitInterval = 10) @Kafka2(brokers = "127.0.0.1:2181", topics = "fire2", groupId = "fire2", startingOffset = "start2", endingOffsets = "end2", sessionTimeout = 100, requestTimeout = 110, pollInterval = 120) @Kafka3(brokers = "127.0.0.1:2181", topics = "fire3", groupId = "fire3", startFromTimestamp = 100, startFromGroupOffsets = true, config = Array[String]("hello=world", "scala=flink")) @RocketMQ(brokers = "rocketmq", topics = "fire", groupId = "fire", startingOffset = "new", tag = "a", autoCommit = true, config = Array[String]("hello=world", "scala=flink")) @RocketMQ2(brokers = "rocketmq2", topics = "fire2", groupId = "fire2", startingOffset = "new2", tag = "b", autoCommit = true, config = Array[String]("hello=world2", "scala=flink2")) @Jdbc(url = "jdbc:mysql://localhost:3306", username = "root1", password = "root1", maxPoolSize = 10, maxIdleTime = 10, batchSize = 51, flushInterval = 1000, logSqlLength = 20, storageLevel = "memory", queryPartitions = 12) @Jdbc2(url = "jdbc:mysql://192.168.0.1:3306", driver = "com.fire", username = "root2", minPoolSize = 9, initialPoolSize = 8, password = "root2", maxRetries = 6, config = Array[String]("hello=world", "scala=flink")) @Jdbc3(url = "jdbc:mysql://192.168.0.2:3306", username = "root3", isolationLevel = "read", password = "root3", acquireIncrement = 2) class AnnoConfTest extends FlinkStreaming with BaseFlinkTester { @Test @TestStep(step = 1, desc = "测试@Jdbc注解") def testJdbc: Unit = { assert(FireJdbcConf.url().equals("jdbc:mysql://localhost:3306")) assert(FireJdbcConf.url(2).equals("jdbc:mysql://192.168.0.1:3306")) assert(FireJdbcConf.url(3).equals("jdbc:mysql://192.168.0.2:3306")) assert(FireJdbcConf.driverClass().equals("com.mysql.jdbc.Driver")) 
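// 补充说明(推测,仅供参考):@Jdbc注解中并未显式指定driver参数,上述driverClass()断言为com.mysql.jdbc.Driver,
// 应是fire框架根据url前缀(jdbc:mysql://)推断出的默认驱动;若像下方@Jdbc2一样显式指定driver,则以注解取值为准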
assert(FireJdbcConf.driverClass(2).equals("com.fire")) assert(FireJdbcConf.user().equals("root1")) assert(FireJdbcConf.user(2).equals("root2")) assert(FireJdbcConf.user(3).equals("root3")) assert(FireJdbcConf.password().equals("root1")) assert(FireJdbcConf.password(2).equals("root2")) assert(FireJdbcConf.password(3).equals("root3")) assert(FireJdbcConf.maxPoolSize() == 10) assert(FireJdbcConf.initialPoolSize(2) == 8) assert(FireJdbcConf.isolationLevel(3).equals("read")) assert(FireJdbcConf.maxIdleTime() == 10) assert(FireJdbcConf.maxRetry(2) == 6) assert(FireJdbcConf.acquireIncrement(3) == 2) assert(FireJdbcConf.batchSize() == 51) assert(FireFrameworkConf.logSqlLength == 20) assert(FireJdbcConf.jdbcStorageLevel.equals("MEMORY")) assert(FireJdbcConf.jdbcFlushInterval() == 1000) assert(FireJdbcConf.jdbcQueryPartition == 12) // "hello=world", "scala=flink" PropUtils.sliceKeysByNum(FireJdbcConf.JDBC_C3P0_CONF_PREFIX, 2).foreach(kv => { if (kv._1.equals("hello")) assert(kv._2.equals("world")) if (kv._1.equals("scala")) assert(kv._2.equals("flink")) }) } /** * 测试@RocketMQ注解 */ @Test @TestStep(step = 2, desc = "测试@RocketMQ注解") def testRocketMQ: Unit = { assert(FireRocketMQConf.rocketNameServer().equals("rocketmq")) assert(FireRocketMQConf.rocketTopics().equals("fire")) assert(FireRocketMQConf.rocketGroupId().equals("fire")) assert(FireRocketMQConf.rocketStartingOffset().equals("new")) assert(FireRocketMQConf.rocketConsumerTag().equals("a")) assert(FireRocketMQConf.rocketEnableAutoCommit()) // "hello=world", "scala=flink" PropUtils.sliceKeysByNum(FireRocketMQConf.rocketConfStart, 1).foreach(kv => { if (kv._1.equals("hello")) assert(kv._2.equals("world")) if (kv._1.equals("scala")) assert(kv._2.equals("flink")) }) assert(FireRocketMQConf.rocketNameServer(2).equals("rocketmq2")) assert(FireRocketMQConf.rocketTopics(2).equals("fire2")) assert(FireRocketMQConf.rocketGroupId(2).equals("fire2")) assert(FireRocketMQConf.rocketStartingOffset(2).equals("new2")) assert(FireRocketMQConf.rocketConsumerTag(2).equals("b")) assert(FireRocketMQConf.rocketEnableAutoCommit(2)) // "hello=world", "scala=flink" PropUtils.sliceKeysByNum(FireRocketMQConf.rocketConfStart, 2).foreach(kv => { if (kv._1.equals("hello")) assert(kv._2.equals("world2")) if (kv._1.equals("scala")) assert(kv._2.equals("flink2")) }) } /** * 测试@Kafka注解 */ @Test @TestStep(step = 3, desc = "测试@Kafka注解") def testKafka: Unit = { assert(FireKafkaConf.kafkaBrokers().equals("localhost:2181")) assert(FireKafkaConf.kafkaTopics().equals("fire")) assert(FireKafkaConf.kafkaGroupId().equals("fire")) assert(FireKafkaConf.kafkaStartingOffset().equals("start")) assert(FireKafkaConf.kafkaEndingOffsets().equals("end")) assert(FireKafkaConf.kafkaEnableAutoCommit()) assert(FireKafkaConf.kafkaSessionTimeOut() == 10) assert(FireKafkaConf.kafkaRequestTimeOut() == 11) assert(FireKafkaConf.kafkaPollInterval() == 12) assert(FireKafkaConf.kafkaForceOverwriteStateOffset) assert(FireKafkaConf.kafkaForceCommit) assert(FireKafkaConf.kafkaForceCommitInterval == 10) assert(FireKafkaConf.kafkaBrokers(2).equals("127.0.0.1:2181")) assert(FireKafkaConf.kafkaTopics(2).equals("fire2")) assert(FireKafkaConf.kafkaGroupId(2).equals("fire2")) assert(FireKafkaConf.kafkaStartingOffset(2).equals("start2")) assert(FireKafkaConf.kafkaEndingOffsets(2).equals("end2")) assert(FireKafkaConf.kafkaSessionTimeOut(2) == 100) assert(FireKafkaConf.kafkaRequestTimeOut(2) == 110) assert(FireKafkaConf.kafkaPollInterval(2) == 120) assert(FireKafkaConf.kafkaStartFromTimeStamp(3) == 100) 
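// 补充说明:方法调用中的数字参数即keyNum,与注解后缀一一对应:不传或传1对应@Kafka,2对应@Kafka2,3对应@Kafka3,
// 因此下方kafkaStartFromTimeStamp(3)、kafkaStartFromGroupOffsets(3)读取的均是@Kafka3中的配置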
assert(FireKafkaConf.kafkaStartFromGroupOffsets(3)) // "hello=world", "scala=flink" PropUtils.sliceKeysByNum(FireKafkaConf.kafkaConfStart, 3).foreach(kv => { if (kv._1.equals("hello")) assert(kv._2.equals("world")) if (kv._1.equals("scala")) assert(kv._2.equals("flink")) }) } /** * 测试@Checkpoint注解 */ @Test @TestStep(step = 4, desc = "测试@Checkpoint注解") def testCheckpoint: Unit = { assert(FireFlinkConf.streamCheckpointInterval == 100) assert(!FireFlinkConf.unalignedCheckpointEnable) assert(FireFlinkConf.streamCheckpointTimeout == 10) assert(FireFlinkConf.streamCheckpointMaxConcurrent == 2) assert(FireFlinkConf.streamCheckpointMinPauseBetween == 30) assert(FireFlinkConf.streamCheckpointTolerableFailureNumber == 10) } @Test @TestStep(step = 5, desc = "测试@Config注解") def testConfig: Unit = { assert(this.conf.getInt("flink.max.parallelism", 10240) == 11) } @Test @TestStep(step = 1, desc = "hive 注解断言") def testHive: Unit = { // @Hive注解优先级低于@Config assert(FireHiveConf.hiveCluster.equals("batch")) assert(FireHiveConf.hiveVersion.equals("1.1.1")) assert(FireHiveConf.hiveCatalogName.equals("hive_catalog")) assert(FireHiveConf.partitionName.equals("dt")) this.logInfo("assert hive annotation success.") } @Test @TestStep(step = 1, desc = "hbase 注解断言") def tesHBase: Unit = { assert(FireHBaseConf.hbaseCluster().equals("batch-new1")) assert(FireHBaseConf.hbaseCluster(2).equals("batch-new2")) assert(FireHBaseConf.hbaseCluster(3).equals("batch-new3")) assert(FireHBaseConf.hbaseDurability(1).equals("off")) assert(!FireHBaseConf.tableExistsCache(2)) assert(FireHBaseConf.familyName(3).equals("data")) assert(FireHBaseConf.hbaseHadoopScanPartitions() == 12) assert(FireHBaseConf.hbaseHadoopScanPartitions(2) == 1200) assert(FireHBaseConf.hbaseBatchSize() == 10) assert(FireHBaseConf.hbaseBatchSize(2) == 10) assert(FireHBaseConf.hbaseMaxRetry(3) == 5) assert(FireHBaseConf.hbaseMaxRetry(2) == 3) assert(FireHBaseConf.hbaseStorageLevel(2).equals("MEMORY_ONLY")) assert(FireHBaseConf.hbaseBatchSize() == 10) assert(FireHBaseConf.hbaseHadoopScanPartitions(3) == 11) assert(this.conf.getString("flink.fire.hbase.conf.hbase.zookeeper.property.clientPort").equals("2181")) assert(this.conf.getString("fire.hbase.conf.zookeeper.znode.parent2").equals("/hbase2")) assert(this.conf.getString("flink.fire.hbase.conf.hbase.zookeeper.property.clientPort3").equals("2183")) } } ================================================ FILE: fire-examples/flink-examples/src/test/scala/com/zto/fire/examples/flink/core/BaseFlinkTester.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.flink.core import com.zto.fire.flink.FlinkStreaming import org.junit.{After, Before} /** * Flink 单元测试父接口,用于初始化fire与flink上下文 * * @author ChengLong * @date 2022-05-17 09:55:30 * @since 2.2.2 */ trait BaseFlinkTester extends FlinkStreaming { /** * 初始化fire框架与flink相关的运行时上下文 */ @Before def before: Unit = { this.init() } @After override def after: Unit = { } } ================================================ FILE: fire-examples/flink-examples/src/test/scala/com/zto/fire/examples/flink/jdbc/JdbcUnitTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.flink.jdbc import com.zto.fire._ import com.zto.fire.common.util.{DateFormatUtils, JSONUtils} import com.zto.fire.core.anno.connector.Kafka import com.zto.fire.examples.bean.Student import com.zto.fire.flink.FlinkStreaming import org.apache.flink.api.scala._ import org.apache.flink.streaming.api.scala.DataStream /** * flink jdbc sink * * @author ChengLong * @since 1.1.0 * @create 2020-05-22 11:10 */ @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire", autoCommit = true) object JdbcUnitTest extends FlinkStreaming { lazy val tableName = "spark_test" lazy val tableName2 = "spark_test2" val fields = "name, age, createTime, length, sex".split(",") def sql(tableName: String): String = s"INSERT INTO $tableName (${fields.mkString(",")}) VALUES (?, ?, ?, ?, ?)" /** * table的jdbc sink */ def testTableJdbcSink(stream: DataStream[Student]): Unit = { stream.createOrReplaceTempView("student") val table = this.fire.sqlQuery("select name, age, createTime, length, sex from student group by name, age, createTime, length, sex") // 方式一、table中的列顺序和类型需与jdbc sql中的占位符顺序保持一致 table.jdbcBatchUpdate(sql(this.tableName)).setParallelism(1) // 或者 this.fire.jdbcBatchUpdateTable(table, sql(this.tableName2)).setParallelism(1) // 方式二、自定义row取数规则,适用于row中的列个数和顺序与sql占位符不一致的情况 table.jdbcBatchUpdate2(sql(this.tableName), flushInterval = 10000, keyNum = 2)(row => { Seq(row.getField(0), row.getField(1), row.getField(2), row.getField(3), row.getField(4)) }) // 或者 this.fire.jdbcBatchUpdateTable2(table, sql(this.tableName2), keyNum = 2)(row => { Seq(row.getField(0), row.getField(1), row.getField(2), row.getField(3), row.getField(4)) }).setParallelism(1) } /** * stream jdbc sink */ def testStreamJdbcSink(stream: DataStream[Student]): Unit = { // 方式一、指定字段列表,内部根据反射,自动获取DataStream中的数据并填充到sql中的占位符 // 此处fields有两层含义:1. sql中的字段顺序(对应表) 2. 
DataStream中的JavaBean字段数据(对应JavaBean) // 注:要保证DataStream中字段名称是JavaBean的名称,非表中字段名称 顺序要与占位符顺序一致,个数也要一致 stream.jdbcBatchUpdate(sql(this.tableName), fields, keyNum = 6).setParallelism(3) // 或者 this.fire.jdbcBatchUpdateStream(stream, sql(this.tableName2), fields, keyNum = 6).setParallelism(1) // 方式二、通过用户指定的匿名函数方式进行数据的组装,适用于上面方法无法反射获取值的情况,适用面更广 stream.jdbcBatchUpdate2(sql(this.tableName), 3, 30000, keyNum = 7) { // 在此处指定取数逻辑,定义如何将dstream中每列数据映射到sql中的占位符 value => Seq(value.getName, value.getAge, DateFormatUtils.formatCurrentDateTime(), value.getLength, value.getSex) }.setParallelism(1) // 或者 this.fire.jdbcBatchUpdateStream2(stream, sql(this.tableName2), keyNum = 7) { value => Seq(value.getName, value.getAge, DateFormatUtils.formatCurrentDateTime(), value.getLength, value.getSex) }.setParallelism(2) } override def process: Unit = { this.initData // 执行查询操作 val studentList = this.fire.jdbcQueryList(s"select * from $tableName", clazz = classOf[Student]) val dataStream = this.fire.fromCollection(studentList) dataStream.toTable.createOrReplaceTempView("test") this.fire.sql( """ |select * from test |""".stripMargin) dataStream.print() // 执行增删改操作 this.fire.jdbcUpdate(s"delete from $tableName") } /* @Test def testJdbc: Unit = { }*/ /** * 消费kafka * * @return * DataStream[Student] */ private def kafkaStream: DataStream[Student] = { this.fire.createKafkaDirectStream().filter(t => JSONUtils.isLegal(t)).map(json => { JSONUtils.parseObject[Student](json) }) } /** * 批量插入测试数据 */ private def initData: Unit = { this.truncate val timestamp = DateFormatUtils.formatCurrentDateTime() // 执行批量操作 val batchSql = s"INSERT INTO $tableName (name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" this.fire.jdbcBatchUpdate(batchSql, Seq(Seq("spark1", 21, timestamp, 100.123, 1), Seq("flink2", 22, timestamp, 12.236, 0), Seq("flink3", 22, timestamp, 12.236, 0), Seq("flink4", 22, timestamp, 12.236, 0), Seq("flink5", 27, timestamp, 17.236, 0))) } /** * 清空表 */ private def truncate: Unit = { this.fire.jdbcUpdate(s"truncate table $tableName") } } ================================================ FILE: fire-examples/pom.xml ================================================ 4.0.0 fire-examples pom Fire : Examples : spark-examples flink-examples com.zto.fire fire-parent 2.3.2-SNAPSHOT ../pom.xml com.zto.fire fire-common_${scala.binary.version} ${fire.version} com.zto.fire fire-core_${scala.binary.version} ${fire.version} com.zto.fire fire-connector-jdbc_${scala.binary.version} ${fire.version} com.zto.fire fire-connector-hbase_${scala.binary.version} ${fire.version} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-examples/spark-examples/pom.xml ================================================ 4.0.0 spark-examples_${spark.reference} jar Fire : Examples : Spark com.zto.fire fire-examples 2.3.2-SNAPSHOT ../pom.xml io.netty netty-all ${netty.version} ${maven.scope} com.zto.fire fire-common_${scala.binary.version} ${fire.version} com.zto.fire fire-core_${scala.binary.version} ${fire.version} com.zto.fire fire-spark_${spark.reference} ${project.version} com.zto.fire fire-enhance-spark_${spark.reference} ${fire.version} com.zto.fire fire-connector-spark-rocketmq_${spark.reference} ${fire.version} com.zto.fire fire-connector-spark-hbase_${spark.reference} ${fire.version} com.zto.fire fire-connector-hbase_${scala.binary.version} ${fire.version} com.zto.fire fire-connector-jdbc_${scala.binary.version} ${fire.version} com.zto.fire 
fire-enhance-arthas_${scala.binary.version} ${fire.version} com.zto.fire fire-metrics_${scala.binary.version} ${fire.version} com.fasterxml.jackson.core jackson-databind ${jackson.version} ${maven.scope} com.fasterxml.jackson.core jackson-core ${jackson.version} ${maven.scope} org.apache.spark spark-core_${scala.binary.version} com.esotericsoftware.kryo kryo ${spark.version} ${maven.scope} org.apache.spark spark-sql_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.spark spark-streaming_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.spark spark-sql-kafka-0-10_${scala.binary.version} ${spark.version} org.apache.spark spark-streaming_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.spark spark-streaming-kafka-0-10_${scala.binary.version} ${spark.version} org.apache.hadoop hadoop-common ${hadoop.version} ${maven.scope} org.apache.hadoop hadoop-hdfs ${hadoop.version} ${maven.scope} org.apache.hadoop hadoop-client ${hadoop.version} ${maven.scope} org.apache.hbase hbase-common ${hbase.version} org.apache.hbase hbase-client org.apache.hbase hbase-server ${hbase.version} org.apache.hbase hbase-client org.apache.hbase hbase-client ${hbase.version} org.apache.rocketmq rocketmq-client ${rocketmq.version} org.apache.hudi hudi-spark-bundle_${scala.binary.version} 0.7.0 ${maven.scope} ru.yandex.clickhouse clickhouse-jdbc 0.2.4 ${maven.scope} com.google.guava guava ${guava.version} ================================================ FILE: fire-examples/spark-examples/src/main/java/com/zto/fire/examples/bean/Hudi.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.bean; import com.zto.fire.common.util.DateFormatUtils; import com.zto.fire.common.util.JSONUtils; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.Arrays; import java.util.Date; import java.util.List; /** * @author ChengLong * @create 2021-02-07 16:45 * @since 1.0.0 */ public class Hudi { private Long id; private String name; private Integer age; private Boolean sex; private String createTime; private String ds; private static int num = 0; public Hudi(Long id, String name, Integer age, Boolean sex) { this.id = id; this.name = name; this.age = age; this.sex = sex; this.createTime = DateFormatUtils.formatCurrentDateTime(); if (num % 2 == 0) { this.ds = DateFormatUtils.formatBySchema(new Date(), "yyyyMMdd"); } else { this.ds = "20200206"; } num += 1; } public Hudi() { } public String getDs() { return ds; } public void setDs(String ds) { this.ds = ds; } public String getCreateTime() { return createTime; } public void setCreateTime(String createTime) { this.createTime = createTime; } public Hudi(Long id) { this.id = id; } public Boolean getSex() { return sex; } public void setSex(Boolean sex) { this.sex = sex; } public Long getId() { return id; } public void setId(Long id) { this.id = id; } public String getName() { return name; } public void setName(String name) { this.name = name; } public Integer getAge() { return age; } public void setAge(Integer age) { this.age = age; } @Override public String toString() { return JSONUtils.toJSONString(this); } public static List newHudiList() { return Arrays.asList( new Hudi(1L, "admin", 12, true), new Hudi(2L, "root", 22, true), new Hudi(3L, "scala", 11, true), new Hudi(4L, "spark", 15, true), new Hudi(5L, "java", 16, true), new Hudi(6L, "hive", 17, true), new Hudi(7L, "presto", 18, true), new Hudi(8L, "flink", 19, true), new Hudi(9L, "streaming", 20, true), new Hudi(10L, "sql", 12, true) ); } public static void main(String[] args) { LocalDateTime dateTime = LocalDateTime.of(2020, 2, 8, 15, 50, 30); dateTime.plusYears(1); System.out.println(dateTime.format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))); System.out.println(LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)); } } ================================================ FILE: fire-examples/spark-examples/src/main/java/com/zto/fire/examples/bean/Student.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.bean; import com.zto.fire.common.util.DateFormatUtils; import com.zto.fire.common.util.JSONUtils; import com.zto.fire.hbase.bean.HBaseBaseBean; import com.zto.fire.spark.bean.GenerateBean; import java.math.BigDecimal; import java.util.Arrays; import java.util.List; import java.util.Objects; /** * 对应HBase表的JavaBean * * @author ChengLong 2019-6-20 16:06:16 */ // @HConfig(multiVersion = true) // @HConfig(nullable = true, multiVersion = true, versions = 3) public class Student extends HBaseBaseBean implements GenerateBean { protected Long id; protected String name; protected Integer age; // 多列族情况下需使用family单独指定 protected String createTime; // 若JavaBean的字段名称与HBase中的字段名称不一致,需使用value单独指定 // 此时hbase中的列名为length1,而不是length //@FieldName(family = "data", value = "length1") protected BigDecimal length; protected Boolean sex; /** * rowkey的构建 * * @return */ @Override public Student buildRowKey() { this.rowKey = this.id.toString(); return this; } public Student(Long id, String name) { this.id = id; this.name = name; } public Student(Long id, String name, Integer age) { this.id = id; this.name = name; this.age = age; } public Student(Long id, String name, Integer age, BigDecimal length, Boolean sex, String createTime) { this.id = id; this.name = name; this.age = age; this.length = length; this.sex = sex; this.createTime = createTime; } public Student(Long id, String name, Integer age, BigDecimal length) { this.id = id; this.name = name; this.age = age; this.length = length; } public Student() { } public Student(Long id) { this.id = id; } public String getCreateTime() { return createTime; } public void setCreateTime(String createTime) { this.createTime = createTime; } public BigDecimal getLength() { return length; } public void setLength(BigDecimal length) { this.length = length; } public Boolean getSex() { return sex; } public void setSex(Boolean sex) { this.sex = sex; } public Long getId() { return id; } public void setId(Long id) { this.id = id; } public String getName() { return name; } public void setName(String name) { this.name = name; } public Integer getAge() { return age; } public void setAge(Integer age) { this.age = age; } @Override public String toString() { return JSONUtils.toJSONString(this); } @Override public List generate() { return newStudentList(); } public static List newStudentList() { String dateTime = DateFormatUtils.formatCurrentDateTime(); return Arrays.asList( new Student(1L, "admin", 12, BigDecimal.valueOf(12.1), true, dateTime), new Student(2L, "root", 22, BigDecimal.valueOf(22), true, dateTime), new Student(3L, "scala", 11, BigDecimal.valueOf(11), true, dateTime), new Student(4L, "spark", 15, BigDecimal.valueOf(15), true, dateTime), new Student(5L, "java", 16, BigDecimal.valueOf(16.1), true, dateTime), new Student(6L, "hive", 17, BigDecimal.valueOf(17.1), true, dateTime), new Student(7L, "presto", 18, BigDecimal.valueOf(18.1), true, dateTime), new Student(8L, "flink", 19, BigDecimal.valueOf(19.1), true, dateTime), new Student(9L, "streaming", 10, BigDecimal.valueOf(10.1), true, dateTime) ); } @Override public boolean equals(Object o) { if (this == o) { return true; } if (!(o instanceof Student)) { return false; } Student student = (Student) o; return Objects.equals(id, student.id) && Objects.equals(name, student.name) && Objects.equals(age, student.age) && Objects.equals(createTime, student.createTime) && Objects.equals(length, student.length) && Objects.equals(sex, student.sex); } @Override public int hashCode() { return Objects.hash(id, name, 
age, createTime, length, sex); } } ================================================ FILE: fire-examples/spark-examples/src/main/java/com/zto/fire/examples/bean/StudentMulti.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.bean; import com.zto.fire.common.util.DateFormatUtils; import com.zto.fire.common.util.JSONUtils; import com.zto.fire.hbase.anno.HConfig; import com.zto.fire.hbase.bean.HBaseBaseBean; import java.math.BigDecimal; import java.util.Arrays; import java.util.List; import java.util.Objects; /** * 对应HBase表的JavaBean * * @author ChengLong * @date 2022-05-11 13:41:42 * @since 2.2.2 */ @HConfig(nullable = true, multiVersion = true, versions = 3) public class StudentMulti extends HBaseBaseBean { protected Long id; protected String name; protected Integer age; // 多列族情况下需使用family单独指定 protected String createTime; // 若JavaBean的字段名称与HBase中的字段名称不一致,需使用value单独指定 // 此时hbase中的列名为length1,而不是length //@FieldName(family = "data", value = "length1") protected BigDecimal length; protected Boolean sex; /** * rowkey的构建 * * @return */ @Override public StudentMulti buildRowKey() { this.rowKey = this.id.toString(); return this; } public StudentMulti(Long id, String name) { this.id = id; this.name = name; } public StudentMulti(Long id, String name, Integer age) { this.id = id; this.name = name; this.age = age; } public StudentMulti(Long id, String name, Integer age, BigDecimal length, Boolean sex, String createTime) { this.id = id; this.name = name; this.age = age; this.length = length; this.sex = sex; this.createTime = createTime; } public StudentMulti(Long id, String name, Integer age, BigDecimal length) { this.id = id; this.name = name; this.age = age; this.length = length; } public StudentMulti() { } public StudentMulti(Long id) { this.id = id; } public String getCreateTime() { return createTime; } public void setCreateTime(String createTime) { this.createTime = createTime; } public BigDecimal getLength() { return length; } public void setLength(BigDecimal length) { this.length = length; } public Boolean getSex() { return sex; } public void setSex(Boolean sex) { this.sex = sex; } public Long getId() { return id; } public void setId(Long id) { this.id = id; } public String getName() { return name; } public void setName(String name) { this.name = name; } public Integer getAge() { return age; } public void setAge(Integer age) { this.age = age; } @Override public String toString() { return JSONUtils.toJSONString(this); } public List generate() { return newStudentMultiList(); } public static List newStudentMultiList() { String dateTime = DateFormatUtils.formatCurrentDateTime(); return Arrays.asList( new StudentMulti(1L, "admin", 12, BigDecimal.valueOf(12.1), true, dateTime), new StudentMulti(2L, "root", 
22, BigDecimal.valueOf(22), true, dateTime), new StudentMulti(3L, "scala", 11, BigDecimal.valueOf(11), true, dateTime), new StudentMulti(4L, "spark", 15, BigDecimal.valueOf(15), true, dateTime), new StudentMulti(5L, "java", 16, BigDecimal.valueOf(16.1), true, dateTime), new StudentMulti(6L, "hive", 17, BigDecimal.valueOf(17.1), true, dateTime), new StudentMulti(7L, "presto", 18, BigDecimal.valueOf(18.1), true, dateTime), new StudentMulti(8L, "flink", 19, BigDecimal.valueOf(19.1), true, dateTime), new StudentMulti(9L, "streaming", 10, BigDecimal.valueOf(10.1), true, dateTime) ); } @Override public boolean equals(Object o) { if (this == o) { return true; } if (!(o instanceof StudentMulti)) { return false; } StudentMulti StudentMulti = (StudentMulti) o; return Objects.equals(id, StudentMulti.id) && Objects.equals(name, StudentMulti.name) && Objects.equals(age, StudentMulti.age) && Objects.equals(createTime, StudentMulti.createTime) && Objects.equals(length, StudentMulti.length) && Objects.equals(sex, StudentMulti.sex); } @Override public int hashCode() { return Objects.hash(id, name, age, createTime, length, sex); } } ================================================ FILE: fire-examples/spark-examples/src/main/resources/common.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# spark.streaming.stopGracefullyOnShutdown = false spark.redaction.regex = (?i)secret|password fire.analysis.arthas.tunnel_server.url = ws://arthas_tunnel_server:7777/ws # \u5B9A\u4E49url\u7684\u522B\u540D\u4E0Eurl\u5BF9\u5E94\u5173\u7CFB\uFF0C\u540E\u7EED\u53EF\u901A\u8FC7\u522B\u540D\u8FDB\u884C\u914D\u7F6E spark.db.jdbc.url.map.test = jdbc:mysql://mysql-server:3306/fire # \u652F\u6301\u522B\u540D\u6216\u76F4\u63A5\u6307\u5B9Aurl spark.db.jdbc.url = test spark.db.jdbc.driver = com.mysql.jdbc.Driver spark.db.jdbc.user = root spark.db.jdbc.password = fire spark.db.jdbc.batch.size = 10 # \u914D\u7F6E\u53E6\u4E00\u4E2A\u6570\u636E\u6E90\uFF0C\u5BF9\u5E94\u7684\u64CD\u4F5C\u9700\u5BF9\u5E94\u52A0\u6570\u5B57\u540E\u7F00\uFF0C\u5982\uFF1Athis.spark.jdbcQueryDF2(sql, Seq(1, 2, 3), classOf[Student]) spark.db.jdbc.url2 = jdbc:mysql://mysql-server:3306/fire2 spark.db.jdbc.driver2 = com.mysql.jdbc.Driver spark.db.jdbc.user2 = root spark.db.jdbc.password2 = fire # \u6BCF\u4E2A\u6279\u6B21\u63D0\u4EA4\u7684\u6570\u636E\u5927\u5C0F\uFF0C\u9ED8\u8BA41000\u6761 spark.db.jdbc.batch.size2 = 2 spark.db.jdbc.url6 = jdbc:mysql://mysql-server:3306/fire6 spark.db.jdbc.driver6 = com.mysql.jdbc.Driver spark.db.jdbc.user6 = root spark.db.jdbc.password6 = fire # \u4E8B\u52A1\u7684\u9694\u79BB\u7EA7\u522BNONE, READ_COMMITTED, READ_UNCOMMITTED, REPEATABLE_READ, SERIALIZABLE\uFF0C\u9ED8\u8BA4\u4E3AREAD_UNCOMMITTED spark.db.jdbc.isolation.level6 = none # \u6BCF\u4E2A\u6279\u6B21\u63D2\u5165\u3001\u66F4\u65B0\u3001\u5220\u9664\u7684\u6570\u636E\u91CF\uFF0C\u9ED8\u8BA4\u4E3A1000 spark.db.jdbc.batch.size6 = 2000 spark.db.jdbc.url7 = jdbc:mysql://mysql-server:3306/fire7 spark.db.jdbc.driver7 = com.mysql.jdbc.Driver spark.db.jdbc.user7 = root spark.db.jdbc.password7 = fire spark.db.jdbc.url8 = jdbc:mysql://mysql-server:3306/fire8 spark.db.jdbc.driver8 = com.mysql.jdbc.Driver spark.db.jdbc.user8 = root spark.db.jdbc.password8 = fire fire.rest.filter.enable = false fire.rest.url.show.enable = true # hive\u76F8\u5173set\u8BED\u53E5 hive.exec.dynamic.partition = true hive.exec.dynamic.partition.mode = nonstrict hive.exec.max.dynamic.partitions = 5000 hive.exec.max.dynamic.partitions.pernode = 5000 hive.merge.mapredfiles = true hive.optimize.sort.dynamic.partition = true # ---------------------------------------------- < \u5F02\u5E38\u8BCA\u65AD\u914D\u7F6E > ----------------------------------------------- # fire.analysis.log.exception.send.mq.url = bigdata_test fire.analysis.log.exception.stack.enable = false # \u662F\u5426\u6253\u5370\u914D\u7F6E\u4FE1\u606F fire.conf.show.enable = true # ---------------------------------------------- < \u8840\u7F18\u91C7\u96C6\u914D\u7F6E > ----------------------------------------------- # # \u662F\u5426\u5F00\u542F\u5C06\u8840\u7F18\u4FE1\u606F\u53D1\u9001\u5230\u6D88\u606F\u961F\u5217 fire.lineage.send.mq.enable = true fire.lineage.send.mq.url = bigdata_test # \u5B9A\u65F6\u89E3\u6790\u57CB\u70B9SQL\u7684\u6267\u884C\u9891\u7387\uFF08s\uFF09 fire.lineage.run.period = 10 ================================================ FILE: fire-examples/spark-examples/src/main/resources/jdbc/JdbcTest.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. 
# The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ######################################################################################### # JDBC数据源配置信息详见:common.properties,公共数据源配置可放到common.properties中,便于维护 # ######################################################################################### # 非必须配置项:spark 任务的appName,不配置则取类名 # spark.appName = test tableName = spark_test #tableName = t_hosts spark.log.level = INFO spark.fire.jdbc.storage.level = DISK_ONLY spark.fire.jdbc.query.partitions = 12 spark.fire.acc.enable = true spark.log.level.fire_conf.com.zto.fire= info # fire框架埋点日志开关,关闭以后将不再打印埋点日志 spark.fire.log.enable = true # 用于限定fire框架中sql日志的字符串长度 spark.fire.log.sql.length = 100 #spark.fire.jdbc.storage.level = memory_and_disk_ser # 通过JdbcConnector查询后将数据集放到多少个分区中,需根据实际的结果集做配置 #spark.fire.jdbc.query.partitions = 10 spark.fire.rest.filter.enable = false hello.world = 2020 spark.fire.config_center.enable = true hello.world.flag = false hello.world.flag2 = false # c3p0参数,以db.c3p0.conf.开头,以keyNum结尾表示不同的数据库实例 db.c3p0.conf.minPoolSize = 10 db.c3p0.conf.maxPoolSize = 20 db.c3p0.conf.autoCommitOnClose = true db.c3p0.conf.AutomaticTestTable = test fire.config_center.local.enable = true spark.fire.config_center.local.enable = true ================================================ FILE: fire-examples/spark-examples/src/main/resources/streaming/ConfigCenterTest.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # #fire.thread.pool.size=10 fire.thread.pool.size=6 fire.restful.max.thread=9 fire.jdbc.query.partitions=11 fire.hbase.scan.repartitions=110 fire.acc.log.max.size=22 fire.conf.test=scala ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/SparkDemo.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.core.anno.connector.{Hive, Kafka} import com.zto.fire.core.anno.lifecycle.Process import com.zto.fire.spark.SparkStreaming import com.zto.fire.spark.anno.Streaming /** * 基于Fire进行Spark Streaming开发 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Config( """ |# 支持Spark调优参数、Fire框架参数、用户自定义参数等 |spark.shuffle.compress=true |spark.ui.enabled=true |""") @Hive("thrift://localhost:9083") // 配置连接到指定的hive @Streaming(interval = 100, maxRatePerPartition = 100) // 100s一个Streaming batch,并限制消费速率 @Kafka(brokers = "localhost:9092", topics = "fire", groupId = "fire") object SparkDemo extends SparkStreaming { @Process def kafkaSource: Unit = { val dstream = this.fire.createKafkaDirectStream() // 使用api的方式消费kafka sql("""select * from xxx""").show() } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/SparkSQLDemo.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark import com.zto.fire.core.anno.lifecycle.{Step1, Step2} import com.zto.fire.examples.bean.Student import com.zto.fire.spark.SparkCore /** * 基于Fire进行spark sql开发 * 使用@Step注解的无参方法将按数值顺序依次被Fire框架调用: *

Step1. 定义数据集 * Step1. 执行耗时:482.00ms * * Step2. 统计记录数 * +--------+ * |count(1)| * +--------+ * | 9| * +--------+ * Step2. 执行耗时:1.43s * * Finished. 总计:2个 成功:2个 失败:0个, 执行耗时:1.92s

* * @author ChengLong * @contact Fire框架技术交流群(钉钉):35373471 */ object SparkSQLDemo extends SparkCore { @Step1("定义数据集") def createDF: Unit = { val df = this.fire.createDataFrame(Student.newStudentList(), classOf[Student]) df.createOrReplaceTempView("student") } @Step2("统计记录数") def count: Unit = { sql( """ |select count(1) from student; | |select count(1) from student; |""".stripMargin).show() } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/Test.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.{DateFormatUtils, JSONUtils, ThreadUtils} import com.zto.fire.core.anno.connector._ import com.zto.fire.examples.bean.Student import com.zto.fire.hbase.HBaseConnector import com.zto.fire.predef.println import com.zto.fire.spark.SparkCore import com.zto.fire.spark.sync.SparkLineageAccumulatorManager import java.util.concurrent.TimeUnit /** * 基于Fire进行Spark Streaming开发 * * @contact Fire框架技术交流群(钉钉):35373471 */ @HBase("test") @Config( """ |fire.lineage.run.initialDelay=10 |fire.shutdown.auto.exit=false |""") @Hive("test") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") @RocketMQ(brokers = "bigdata_test", topics = "fire2", groupId = "fire") @Jdbc(url = "jdbc:mysql://mysql-server:3306/fire", username = "root", password = "root") object Test extends SparkCore { private val hbaseTable = "fire_test_1" private lazy val tableName = "spark_test" override def process: Unit = { ThreadUtils.scheduleAtFixedRate({ println(s"累加器值:" + JSONUtils.toJSONString(SparkLineageAccumulatorManager.getValue)) }, 0, 60, TimeUnit.SECONDS) this.fire.createDataFrame(Student.newStudentList(), classOf[Student]).createOrReplaceTempView("student") sql( s""" |create table if not exists tmp.zto_fire_test |select a.*,'sh' as city |from dw.mdb_md_dbs a left join student t on a.ds=t.name |where ds='20211001' limit 100 |""".stripMargin) (1 to 10).foreach(x => { val df = this.fire.createDataFrame(Student.newStudentList(), classOf[Student]) df.rdd.foreachPartition(it => { val timestamp = DateFormatUtils.formatCurrentDateTime() val insertSql = s"INSERT INTO $tableName (name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" this.fire.jdbcUpdate(insertSql, Seq("admin", 12, timestamp, 10.0, 1)) HBaseConnector.get[Student](hbaseTable, classOf[Student], Seq("1")) }) Thread.sleep(10000) }) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/acc/FireAccTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or 
more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.acc import com.zto.fire._ import com.zto.fire.common.anno.Scheduled import com.zto.fire.common.util.{DateFormatUtils, PropUtils, ThreadUtils} import com.zto.fire.core.anno.connector.{Hive, Kafka} import com.zto.fire.spark.SparkStreaming import com.zto.fire.spark.anno.Streaming import java.util.concurrent.TimeUnit /** * 用于演示与测试Fire框架内置的累加器 * * @author ChengLong 2019年9月10日 09:50:16 * @contact Fire框架技术交流群(钉钉):35373471 */ @Streaming(10) @Hive("test") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object FireAccTest extends SparkStreaming { val key = "fire.partitions" override def process: Unit = { if (this.args != null) { this.args.foreach(arg => println(arg + " ")) } val dstream = this.fire.createKafkaDirectStream() dstream.foreachRDD(rdd => { rdd.coalesce(this.conf.getInt(key, 10)).foreachPartition(t => { println("conf=" + this.conf.getInt(key, 10) + " PropUtils=" + PropUtils.getString(key)) // 单值累加器 this.acc.addCounter(1) // 多值累加器,根据key的不同分别进行数据的累加 this.acc.addMultiCounter("multiCounter", 1) this.acc.addMultiCounter("partitions", 1) // 多时间维度累加器,比多值累加器多了一个时间维度,如:hbaseWriter 2019-09-10 11:00:00 10 this.acc.addMultiTimer("multiTimer", 1) }) }) // 定时打印fire内置累加器中的值 ThreadUtils.schedule(this.printAcc, 0, 10, true, TimeUnit.MINUTES) } /** * 打印累加器中的值 */ def printAcc: Unit = { println(s"===============${DateFormatUtils.formatCurrentDateTime()}=============") this.acc.getMultiTimer.cellSet().foreach(t => println(s"key:" + t.getRowKey + " 时间:" + t.getColumnKey + " " + t.getValue + "条")) println("单值:" + this.acc.getCounter) this.acc.getMultiCounter.foreach(t => { println("多值:key=" + t._1 + " value=" + t._2) }) val size = this.acc.getMultiTimer.cellSet().size() println(s"======multiTimer.size=${size}==log.size=${this.acc.getLog.size()}======") } @Scheduled(fixedInterval = 60 * 1000, scope = "all") def loadTable: Unit = { println(s"${DateFormatUtils.formatCurrentDateTime()}=================== 每分钟执行loadTable ===================") } @Scheduled(cron = "0 0 * * * ?") def loadTable2: Unit = { println(s"${DateFormatUtils.formatCurrentDateTime()}=================== 每小时执行loadTable2 ===================") } @Scheduled(cron = "0 0 9 * * ?") def loadTable3: Unit = { println(s"${DateFormatUtils.formatCurrentDateTime()}=================== 每天9点执行loadTable3 ===================") } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/hbase/HBaseConnectorTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.hbase import java.nio.charset.StandardCharsets import com.zto.fire._ import com.zto.fire.core.anno.connector.{HBase, HBase2} import com.zto.fire.examples.bean.Student import com.zto.fire.hbase.HBaseConnector import com.zto.fire.spark.SparkCore import org.apache.hadoop.hbase.client.Get import org.apache.spark.sql.Encoders import scala.collection.mutable.ListBuffer /** * 在spark中使用java 同步 api (HBaseConnector) 的方式读写hbase表 * 注:适用于少量数据的实时读写,更轻量 * * @author ChengLong 2019-5-9 09:37:25 * @contact Fire框架技术交流群(钉钉):35373471 */ @HBase("test") @HBase2(cluster = "test", scanPartitions = 3, storageLevel = "DISK_ONLY") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object HBaseConnectorTest extends SparkCore { private val tableName1 = "fire_test_1" private val tableName2 = "fire_test_2" /** * 使用HBaseConnector插入一个集合,可以是list、set等集合 * 但集合的类型必须为HBaseBaseBean的子类 */ def testHbasePutList: Unit = { val studentList = Student.newStudentList() this.fire.hbasePutList(this.tableName1, studentList) } /** * 使用HBaseConnector插入一个rdd的数据 * rdd的类型必须为HBaseBaseBean的子类 */ def testHbasePutRDD: Unit = { val studentList = Student.newStudentList() val studentRDD = this.fire.createRDD(studentList, 2) // 为空的字段不插入 studentRDD.hbasePutRDD(this.tableName1) } /** * 使用HBaseConnector插入一个DataFrame的数据 */ def testHBasePutDF: Unit = { val studentList = Student.newStudentList() val studentDF = this.fire.createDataFrame(studentList, classOf[Student]) // 每个批次插100条 studentDF.hbasePutDF(this.tableName1, classOf[Student]) } /** * 使用HBaseConnector插入一个Dataset的数据 * dataset的类型必须为HBaseBaseBean的子类 */ def testHBasePutDS: Unit = { val studentList = Student.newStudentList() val studentDS = this.fire.createDataset(studentList)(Encoders.bean(classOf[Student])) // 以多版本形式插入 studentDS.hbasePutDS(this.tableName2, classOf[Student]) } /** * 使用HBaseConnector get数据,并将结果以list方式返回 */ def testHbaseGetList: Unit = { println("===========testHbaseGetList===========") val rowKeys = Seq("1", "2", "3", "5", "6") val studentList = this.fire.hbaseGetList2(this.tableName1, classOf[Student], rowKeys) studentList.foreach(println) val getList = ListBuffer[Get]() rowKeys.map(rowkey => (getList += new Get(rowkey.getBytes(StandardCharsets.UTF_8)))) // 获取多版本形式存放的记录,并获取最新的两个版本就 val studentList2 = this.fire.hbaseGetList(this.tableName1, classOf[Student], getList) studentList2.foreach(println) } /** * 使用HBaseConnector get数据,并将结果以RDD方式返回 */ def testHbaseGetRDD: Unit = { println("===========testHBaseConnectorGetRDD===========") val getList = Seq("1", "2", "3", "5", "6") val getRDD = this.fire.createRDD(getList, 2) // 以多版本方式get,并将结果集封装到rdd中返回 val studentRDD = this.fire.hbaseGetRDD(this.tableName1, classOf[Student], getRDD) studentRDD.printEachPartition } /** * 使用HBaseConnector get数据,并将结果以DataFrame方式返回 */ def testHbaseGetDF: Unit = { 
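// builds an RDD of rowKeys and returns the matching records as a DataFrame via hbaseGetDF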
println("===========testHBaseConnectorGetDF===========") val getList = Seq("1", "2", "3", "4", "5", "6") val getRDD = this.fire.createRDD(getList, 3) // get到的结果以dataframe形式返回 val studentDF = this.fire.hbaseGetDF(this.tableName1, classOf[Student], getRDD) studentDF.show(100, false) } /** * 使用HBaseConnector get数据,并将结果以Dataset方式返回 */ def testHBaseGetDS: Unit = { println("===========testHBaseGetDS===========") val getList = Seq("1", "2", "3", "4", "5", "6") val getRDD = this.fire.createRDD(getList, 2) // 指定在多版本获取时只取最新的两个版本 val studentDS = this.fire.hbaseGetDS(this.tableName1, classOf[Student], getRDD) studentDS.show(100, false) } /** * 使用HBaseConnector scan数据,并以list方式返回 */ def testHbaseScanList: Unit = { println("===========testHbaseScanList===========") val list = this.fire.hbaseScanList2(this.tableName1, classOf[Student], "1", "6") list.foreach(println) } /** * 使用HBaseConnector scan数据,并以RDD方式返回 */ def testHbaseScanRDD: Unit = { println("===========testHbaseScanRDD===========") val rdd = this.fire.hbaseScanRDD2(this.tableName1, classOf[Student], "1", "6") rdd.repartition(3).printEachPartition } /** * 使用HBaseConnector scan数据,并以DataFrame方式返回 */ def testHbaseScanDF: Unit = { println("===========testHbaseScanDF===========") val dataFrame = this.fire.hbaseScanDF2(this.tableName1, classOf[Student], "1", "6") dataFrame.repartition(3).show(100, false) } /** * 使用HBaseConnector scan数据,并以DataFrame方式返回 */ def testHbaseScanDS: Unit = { println("===========testHbaseScanDF===========") val dataSet = this.fire.hbaseScanDS2(this.tableName1, classOf[Student], "1", "6") dataSet.show(100, false) } /** * 根据指定的rowKey list,批量删除指定的记录 */ def testHbaseDeleteList: Unit = { val rowKeyList = Seq(1.toString, 2.toString, 5.toString, 8.toString) this.fire.hbaseDeleteList(this.tableName1, rowKeyList) } /** * 根据指定的rowKey rdd,批量删除指定的记录 */ def testHBaseDeleteRDD: Unit = { val rowKeyList = Seq(1.toString, 2.toString, 3.toString, 4.toString, 5.toString, 6.toString, 7.toString, 8.toString, 9.toString, 10.toString) val rowKeyRDD = this.fire.createRDD(rowKeyList, 2) rowKeyRDD.hbaseDeleteRDD(this.tableName1) } /** * 根据指定的rowKey dataset,批量删除指定的记录 */ def testHbaseDeleteDS: Unit = { val rowKeyList = Seq(1.toString, 2.toString, 5.toString, 8.toString) val rowKeyDS = this.fire.createDataset(rowKeyList)(Encoders.STRING) rowKeyDS.hbaseDeleteDS(this.tableName1) } /** * 多版本get与scan */ def testMutiVersion: Unit = { this.testHBasePutDF this.testHBasePutDF this.testHBasePutDF this.testHBasePutDF print("======testHbaseGetList=======") this.testHbaseGetList print("======testHbaseGetRDD=======") this.testHbaseGetRDD print("======testHbaseGetDF=======") this.testHbaseGetDF print("======testHBaseGetDS=======") this.testHBaseGetDS println("==========scan============") print("======testHbaseScanList=======") this.testHbaseScanList print("======testHbaseScanRDD=======") this.testHbaseScanRDD print("======testHbaseScanDF=======") this.testHbaseScanDF print("======testHbaseScanDS=======") this.testHbaseScanDS } /** * Spark处理过程 * 注:此方法会被自动调用 */ override def process: Unit = { // 指定是否以多版本的形式读写 // this.testHBaseDeleteRDD this.testHbaseDeleteDS HBaseConnector.truncateTable(this.tableName1) HBaseConnector.truncateTable(this.tableName2, keyNum = 2) this.testHbasePutRDD this.testHbasePutList // HBaseConnector.truncateTable(this.tableName1) this.testHbaseGetDF this.testHBasePutDS // this.testMutiVersion println("=========get========") this.testHbaseGetList this.testHbaseGetRDD this.testHbaseGetDF this.testHBaseGetDS println("=========scan========") 
this.testHbaseScanList this.testHbaseScanRDD this.testHbaseScanDF this.testHbaseScanDS val getList = ListBuffer(HBaseConnector.buildGet("1")) val student = HBaseConnector.get(this.tableName1, classOf[Student], getList, 1) println(student.toString()) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/hbase/HBaseHadoopTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.hbase import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.examples.bean.Student import com.zto.fire.hbase.HBaseConnector import com.zto.fire.spark.SparkCore import org.apache.spark.sql.{Encoders, Row} /** * 本示例演示Spark提供的hbase api封装后的使用 * 注:使用Spark写hbase的方式适用于海量数据离线写 * * @author ChengLong 2019-5-9 09:37:25 * @contact Fire框架技术交流群(钉钉):35373471 */ @Config( """ |# 用于区分不同的hbase集群: batch/streaming/old |spark.hbase.cluster = test |spark.hbase.cluster2 = test |# 通过HBase scan后repartition的分区数,需根据scan后的数据量做配置 |spark.fire.hbase.scan.partitions = 3 |spark.fire.hbase.storage.level = DISK_ONLY |""") object HBaseHadoopTest extends SparkCore { private val tableName6 = "fire_test_6" private val tableName7 = "fire_test_7" /** * 基于saveAsNewAPIHadoopDataset封装,将rdd数据保存到hbase中 */ def testHbaseHadoopPutRDD: Unit = { val studentRDD = this.fire.createRDD(Student.newStudentList(), 2) this.fire.hbaseHadoopPutRDD(this.tableName6, studentRDD, keyNum = 2) // 方式二:直接基于rdd进行方法调用 // studentRDD.hbaseHadoopPutRDD(this.tableName1) } /** * 基于saveAsNewAPIHadoopDataset封装,将DataFrame数据保存到hbase中 */ def testHbaseHadoopPutDF: Unit = { val studentRDD = this.fire.createRDD(Student.newStudentList(), 2) val studentDF = this.fire.createDataFrame(studentRDD, classOf[Student]) // 由于DataFrame相较于Dataset和RDD是弱类型的数据集合,所以需要传递具体的类型classOf[Type] this.fire.hbaseHadoopPutDF(this.tableName7, studentDF, classOf[Student]) // 方式二:基于DataFrame进行方法调用 // studentDF.hbaseHadoopPutDF(this.tableName3, classOf[Student]) } /** * 基于saveAsNewAPIHadoopDataset封装,将Dataset数据保存到hbase中 */ def testHbaseHadoopPutDS: Unit = { val studentDS = this.fire.createDataset(Student.newStudentList())(Encoders.bean(classOf[Student])) this.fire.hbaseHadoopPutDS(this.tableName7, studentDS) // 方式二:基于Dataset进行方法调用 // studentDS.hbaseHadoopPutDS(this.tableName3) } /** * 基于saveAsNewAPIHadoopDataset封装,将不是HBaseBaseBean结构对应的DataFrame保存到hbase中 * 注:此方法与hbaseHadoopPutDF不同之处在于,它不强制要求该DataFrame一定要与HBaseBaseBean的子类对应 * 但需要指定rowKey的构建规则,相对于hbaseHadoopPutDF来说,少了中间的两次转换,性能会更高 */ def testHbaseHadoopPutDFRow: Unit = { /** * 构建main_order rowkey */ val buildRowKey = (row: Row) => { // 将id字段作为rowKey row.getAs("id").toString } val studentRDD = this.fire.createRDD(Student.newStudentList(), 2) this.fire.createDataFrame(studentRDD,
classOf[Student]).createOrReplaceTempView("student") // 指定rowKey构建的函数 sql("select age,createTime,id,length,name,sex from student").hbaseHadoopPutDFRow(this.tableName7, buildRowKey) } /** * 使用Spark的方式scan海量数据,并将结果集映射为RDD */ def testHBaseHadoopScanRDD: Unit = { println("===========testHBaseHadoopScanRDD===========") val studentRDD = this.fire.hbaseHadoopScanRDD2(this.tableName6, classOf[Student], "1", "6", keyNum = 2) studentRDD.printEachPartition } /** * 使用Spark的方式scan海量数据,并将结果集映射为DataFrame */ def testHBaseHadoopScanDF: Unit = { println("===========testHBaseHadoopScanDF===========") val studentDF = this.fire.hbaseHadoopScanDF2(this.tableName7, classOf[Student], "1", "6") studentDF.show(100, false) } /** * 使用Spark的方式scan海量数据,并将结果集映射为Dataset */ def testHBaseHadoopScanDS: Unit = { println("===========testHBaseHadoopScanDS===========") val studentDS = this.fire.hbaseHadoopScanDS2(this.tableName7, classOf[Student], "1", "6") studentDS.show(100, false) } /** * Spark处理过程 * 注:此方法会被自动调用 */ override def process: Unit = { HBaseConnector.truncateTable(this.tableName6, keyNum = 2) HBaseConnector.truncateTable(this.tableName7) this.testHbaseHadoopPutRDD // this.testHbaseHadoopPutDF // this.testHbaseHadoopPutDS this.testHbaseHadoopPutDFRow this.testHBaseHadoopScanRDD this.testHBaseHadoopScanDF this.testHBaseHadoopScanDS } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/hbase/HBaseStreamingTest.scala ================================================ package com.zto.fire.examples.spark.hbase import com.zto.fire._ import com.zto.fire.core.anno.connector.{HBase, HBase2, Kafka} import com.zto.fire.examples.bean.Student import com.zto.fire.hbase.HBaseConnector import com.zto.fire.spark.SparkStreaming import com.zto.fire.spark.anno.Streaming /** * 通过hbase相关api,将数据实时写入到hbase中 * @author ChengLong 2019-5-26 13:21:59 * @contact Fire框架技术交流群(钉钉):35373471 */ @HBase("test") @Streaming(interval = 30, concurrent = 2) @HBase2(cluster = "test", scanPartitions = 30, storageLevel = "DISK_ONLY") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object HBaseStreamingTest extends SparkStreaming { private val tableName8 = "fire_test_8" private val tableName9 = "fire_test_9" override def process: Unit = { val dstream = this.fire.createKafkaDirectStream() HBaseConnector.truncateTable(this.tableName8) HBaseConnector.truncateTable(this.tableName9, keyNum = 2) dstream.repartition(3).foreachRDD(rdd => { rdd.foreachPartition(it => { HBaseConnector.insert(this.tableName8, Student.newStudentList()) val student = HBaseConnector.get(this.tableName9, classOf[Student], Seq("1", "2")) student.foreach(t => logger.error("HBase1 Get结果:" + t)) HBaseConnector.insert(this.tableName9, Student.newStudentList()) val student2 = HBaseConnector.get(this.tableName8, classOf[Student], Seq("2", "3"), keyNum = 2) student2.foreach(t => logger.error("HBase2 Get结果:" + t)) }) }) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/hbase/HbaseBulkTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.hbase import com.zto.fire._ import com.zto.fire.core.anno.connector.{HBase, HBase2} import com.zto.fire.examples.bean.Student import com.zto.fire.hbase.HBaseConnector import com.zto.fire.spark.SparkCore import org.apache.spark.sql.{Encoders, Row} /** * 本示例用于演示spark中使用bulk api完成HBase的读写 * 注:bulk api相较于java api,在速度上会更快,但目前暂不支持多版本读写 * * @author ChengLong 2019-5-18 09:20:52 * @contact Fire框架技术交流群(钉钉):35373471 */ @HBase("test") @HBase2(cluster = "test", scanPartitions = 3, storageLevel = "DISK_ONLY") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object HBaseBulkTest extends SparkCore { private val tableName3 = "fire_test_3" private val tableName5 = "fire_test_5" /** * 使用id作为rowKey */ val buildStudentRowKey = (row: Row) => { row.getAs("id").toString } /** * 使用bulk的方式将rdd写入到hbase */ def testHbaseBulkPutRDD: Unit = { // 方式一:将rdd的数据写入到hbase中,rdd类型必须为HBaseBaseBean的子类 val rdd = this.fire.createRDD(Student.newStudentList(), 2) // rdd.hbaseBulkPutRDD(this.tableName2) // 方式二:使用this.fire.hbaseBulkPut将rdd中的数据写入到hbase this.fire.hbaseBulkPutRDD(this.tableName5, rdd) // 第二个参数指定false表示不插入为null的字段到hbase中 // rdd.hbaseBulkPutRDD(this.tableName2, insertEmpty = false) // 第三个参数为true表示以多版本json格式写入 // rdd.hbaseBulkPutRDD(this.tableName3, false, true) } /** * 使用bulk的方式将DataFrame写入到hbase */ def testHbaseBulkPutDF: Unit = { // 方式一:将DataFrame的数据写入到hbase中 val rdd = this.fire.createRDD(Student.newStudentList(), 2) val studentDF = this.fire.createDataFrame(rdd, classOf[Student]) // insertEmpty=false表示为空的字段不插入 studentDF.hbaseBulkPutDF(this.tableName3, classOf[Student], keyNum = 2) // 方式二: // this.fire.hbaseBulkPutDF(this.tableName2, studentDF, classOf[Student]) } /** * 使用bulk的方式将Dataset写入到hbase */ def testHbaseBulkPutDS: Unit = { // 方式一:将DataFrame的数据写入到hbase中 val rdd = this.fire.createRDD(Student.newStudentList(), 2) val studentDataset = this.fire.createDataset(rdd)(Encoders.bean(classOf[Student])) // multiVersion=true表示以多版本形式插入 studentDataset.hbaseBulkPutDS(this.tableName5) // 方式二: // this.fire.hbaseBulkPutDS(this.tableName3, studentDataset) } /** * 使用bulk方式根据rowKey集合获取数据,并将结果集以RDD形式返回 */ def testHBaseBulkGetSeq: Unit = { println("===========testHBaseBulkGetSeq===========") // 方式一:使用rowKey集合读取hbase中的数据 val seq = Seq(1.toString, 2.toString, 3.toString, 5.toString, 6.toString) val studentRDD = this.fire.hbaseBulkGetSeq(this.tableName5, seq, classOf[Student]) studentRDD.foreach(println) // 方式二:使用this.fire.hbaseBulkGetRDD /*val studentRDD2 = this.fire.hbaseBulkGetSeq(this.tableName2, seq, classOf[Student]) studentRDD2.foreach(println)*/ } /** * 使用bulk方式根据rowKey获取数据,并将结果集以RDD形式返回 */ def testHBaseBulkGetRDD: Unit = { println("===========testHBaseBulkGetRDD===========") // 方式一:使用rowKey读取hbase中的数据,rowKeyRdd类型为String val rowKeyRdd = this.fire.createRDD(Seq(1.toString, 2.toString, 3.toString, 5.toString, 6.toString), 2) val studentRDD = rowKeyRdd.hbaseBulkGetRDD(this.tableName3, classOf[Student], keyNum = 2) 
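// keyNum = 2 points this bulk get at the second HBase cluster (declared via @HBase2 above); testHbaseBulkPutDF writes fire_test_3 with keyNum = 2 as well, so the read and write target the same cluster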
studentRDD.foreach(println) // 方式二:使用this.fire.hbaseBulkGetRDD // val studentRDD2 = this.fire.hbaseBulkGetRDD(this.tableName2, rowKeyRdd, classOf[Student]) // studentRDD2.foreach(println) } /** * 使用bulk方式根据rowKey获取数据,并将结果集以DataFrame形式返回 */ def testHBaseBulkGetDF: Unit = { println("===========testHBaseBulkGetDF===========") // 方式一:使用rowKey读取hbase中的数据,rowKeyRdd类型为String val rowKeyRdd = this.fire.createRDD(Seq(1.toString, 2.toString, 3.toString, 5.toString, 6.toString), 2) val studentDF = rowKeyRdd.hbaseBulkGetDF(this.tableName5, classOf[Student]) studentDF.show(100, false) // 方式二:使用this.fire.hbaseBulkGetDF val studentDF2 = this.fire.hbaseBulkGetDF(this.tableName5, rowKeyRdd, classOf[Student]) studentDF2.show(100, false) } /** * 使用bulk方式根据rowKey获取数据,并将结果集以Dataset形式返回 */ def testHBaseBulkGetDS: Unit = { println("===========testHBaseBulkGetDS===========") // 方式一:使用rowKey读取hbase中的数据,rowKeyRdd类型为String val rowKeyRdd = this.fire.createRDD(Seq(1.toString, 2.toString, 3.toString, 5.toString, 6.toString), 2) val studentDS = rowKeyRdd.hbaseBulkGetDS(this.tableName5, classOf[Student]) studentDS.show(100, false) // 方式二:使用this.fire.hbaseBulkGetDF // val studentDS2 = this.fire.hbaseBulkGetDS(this.tableName2, rowKeyRdd, classOf[Student]) // studentDS2.show(100, false) } /** * 使用bulk方式进行scan,并将结果集映射为RDD */ def testHbaseBulkScanRDD: Unit = { println("===========testHbaseBulkScanRDD===========") // scan操作,指定rowKey的起止或直接传入自己构建的scan对象实例,返回类型为RDD[Student] val scanRDD = this.fire.hbaseBulkScanRDD2(this.tableName5, classOf[Student], "1", "6") scanRDD.foreach(println) } /** * 使用bulk方式进行scan,并将结果集映射为DataFrame */ def testHbaseBulkScanDF: Unit = { println("===========testHbaseBulkScanDF===========") // scan操作,指定rowKey的起止或直接传入自己构建的scan对象实例,返回类型为DataFrame val scanDF = this.fire.hbaseBulkScanDF2(this.tableName5, classOf[Student], "1", "6") scanDF.show(100, false) } /** * 使用bulk方式进行scan,并将结果集映射为Dataset */ def testHbaseBulkScanDS: Unit = { println("===========testHbaseBulkScanDS===========") // scan操作,指定rowKey的起止或直接传入自己构建的scan对象实例,返回类型为Dataset[Student] val scanDS = this.fire.hbaseBulkScanDS(this.tableName5, classOf[Student], HBaseConnector.buildScan("1", "6")) scanDS.show(100, false) } /** * 使用bulk方式批量删除指定的rowKey对应的数据 */ def testHBaseBulkDeleteRDD: Unit = { // 方式一:使用rowKey读取hbase中的数据,rowKeyRdd类型为String val rowKeyRdd = this.fire.createRDD(Seq(1.toString, 2.toString, 5.toString, 6.toString), 2) // 根据rowKey删除 rowKeyRdd.hbaseBulkDeleteRDD(this.tableName5) // 方式二:使用this.fire.hbaseBulkDeleteRDD // this.fire.hbaseBulkDeleteRDD(this.tableName1, rowKeyRdd) } /** * 使用bulk方式批量删除指定的rowKey对应的数据 */ def testHBaseBulkDeleteDS: Unit = { // 方式一:使用rowKey读取hbase中的数据,rowKeyRdd类型为String val rowKeyRdd = this.fire.createRDD(Seq(1.toString, 2.toString, 5.toString, 6.toString), 2) // 根据rowKey删除 this.fire.createDataset(rowKeyRdd)(Encoders.STRING).hbaseBulkDeleteDS(this.tableName5) // 方式二:使用this.fire.hbaseBulkDeleteDS // this.fire.hbaseBulkDeleteDS(this.tableName1, rowKeyRdd) } /** * Spark处理过程 * 注:此方法会被自动调用 */ override def process: Unit = { this.testHBaseBulkDeleteRDD HBaseConnector.truncateTable(this.tableName3, keyNum = 2) HBaseConnector.truncateTable(this.tableName5) // this.testHBaseBulkDeleteDS // this.testHbaseBulkPutRDD this.testHbaseBulkPutDF this.testHbaseBulkPutDS println("=========get========") this.testHBaseBulkGetRDD this.testHBaseBulkGetDF this.testHBaseBulkGetDS this.testHBaseBulkGetSeq println("=========scan========") this.testHbaseBulkScanRDD this.testHbaseBulkScanDF this.testHbaseBulkScanDS } } 
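// ---------------------------------------------------------------------------
// Illustrative sketch (not from the original repository): the HBase examples
// above pair numbered annotations (@HBase, @HBase2) with a keyNum argument.
// Assuming keyNum = 2 resolves to the cluster declared via @HBase2 (which is
// what the put/get/truncate calls in this file suggest), a minimal usage of
// that convention might look like the hypothetical object below. Only calls
// that already appear in these examples are used; the object name is made up.
// ---------------------------------------------------------------------------
@HBase("test")
@HBase2(cluster = "test")
object HBaseKeyNumSketch extends SparkCore {
  private val table = "fire_test_1"

  override def process: Unit = {
    // calls without keyNum target the cluster declared via @HBase
    HBaseConnector.truncateTable(this.table)
    HBaseConnector.insert(this.table, Student.newStudentList())
    HBaseConnector.get(this.table, classOf[Student], Seq("1"))

    // keyNum = 2 switches the same calls to the cluster declared via @HBase2
    HBaseConnector.truncateTable(this.table, keyNum = 2)
    HBaseConnector.get(this.table, classOf[Student], Seq("1"), keyNum = 2)
  }
}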
================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/hbase/HiveQL.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.hbase /** * Hive sql * * @author ChengLong 2019-1-16 09:53:45 * @contact Fire框架技术交流群(钉钉):35373471 */ object HiveQL { /** * 执行order main sql * @param tableName * @return */ def saveMainOrder(tableName: String): String = { s""" |select |gtid, |logFile, |offset, |op_type, |pos, |schema, |table, |msg_when, |after.*, |before.bill_code before_bill_code, |before.order_code before_order_code |from ${tableName} |where op_type<>'D' |and after.bill_code<>'' |and substr(table,0,6)='order_' |and substr(table,0,7)<>'order_r' """.stripMargin } /** * 执行delete order main sql * @param tableName * @return */ def deleteMainOrder(tableName: String): String = { s""" |select |gtid, |logFile, |offset, |op_type, |pos, |schema, |table, |msg_when, |before.* |from ${tableName} |where op_type='D' |and before.bill_code<>'' |and before.order_create_date>'2018-06-01' |and substr(table,0,6)='order_' |and substr(table,0,7)<>'order_r' """.stripMargin } /** * 执行save replica order sql * @param tableName * @return */ def saveReplicaOrder(tableName: String): String = { s""" |select |gtid, |logFile, |offset, |op_type, |pos, |schema, |table, |msg_when, |after.*, |before.bill_code before_bill_code, |before.order_code before_order_code |from ${tableName} |where op_type<>'D' |and after.bill_code<>'' |and substr(table,0,7)='order_r' """.stripMargin } /** * 执行delete replica order sql * @param tableName * @return */ def deleteReplicaOrder(tableName: String): String = { s""" |select |gtid, |logFile, |offset, |op_type, |pos, |schema, |table, |msg_when, |before.* |from ${tableName} |where op_type='D' |and before.order_create_date>'2018-06-01' |and before.bill_code<>'' |and substr(table,0,7)='order_r' """.stripMargin } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/hive/HiveClusterReader.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.hive import com.zto.fire.core.anno.connector.Hive import com.zto.fire.spark.SparkCore /** * 本示例用于演示spark读取不同hive集群,配置文件请见 HiveClusterReader.properties,继承自SparkCore表示是一个离线的spark程序 * 如果需要使用不同的hive集群,只需在该类同名的配置文件中加一下配置即可:hive.cluster=streaming,表示读取180实时集群的hive元数据 * * @author ChengLong 2019-5-17 10:39:19 * @contact Fire框架技术交流群(钉钉):35373471 */ @Hive("test") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object HiveClusterReader extends SparkCore { override def process: Unit = { // spark为sparkSession的实例,已经在init()中完成初始化,可以直接通过this.fire或this.spark方式调用 sql("use tmp") sql("show tables").show(100, false) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/hive/HiveMetadataTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.spark.hive import com.zto.fire.common.anno.Config import com.zto.fire.core.anno.connector.{Hive, Kafka} import com.zto.fire.spark.SparkCore /** * 基于Fire进行Spark Streaming开发 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Config( """ |spark.sql.statistics.size.autoUpdate.enabled=true |""") @Hive("test") // 配置消费的kafka信息 @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") object HiveMetadataTest extends SparkCore { val sourceTable = "ods.mdb_md_dbs" val partitionTable = "dw.mdb_md_dbs_fire_orc" val multiPartitionTable = "tmp.mdb_md_dbs_fire_multi_partition_orc" val bucketTable = "tmp.mdb_md_dbs_fire_bucket" override def process: Unit = { this.testSinglePartitionTable // this.testMultiPartitionTable // this.testNoPartitionTable sql( s""" |INSERT INTO ${bucketTable} |SELECT * FROM VALUES(1,1,1) |""".stripMargin) } /** * 测试分区表更新hive元数据用例 */ def testMultiPartitionTable: Unit = { sql( s""" |insert into table ${multiPartitionTable} partition(ds, city) select *,'sh' as city from dw.mdb_md_dbs where ds='20211001' limit 100 |""".stripMargin) (1 to 3).foreach(x => { sql( s""" |insert into table ${multiPartitionTable} partition(ds, city) select *,'sh' as city from dw.mdb_md_dbs where ds='20211001' limit 100 |""".stripMargin) }) (1 to 3).foreach(x => { sql( s""" |insert overwrite table ${multiPartitionTable} partition(ds, city) select *,'bj' as city from dw.mdb_md_dbs where ds='20211001' limit 100 |""".stripMargin) }) } /** * 测试非分区表更新hive元数据用例 */ def testNoPartitionTable: Unit = { (1 to 3).foreach(x => { // orc非分区表 sql( s""" |insert into table dw.mdb_md_dbs_fire_orc_nopart select * from ${sourceTable} where ds='20190619' limit 10 |""".stripMargin) sql( """ |select count(1) from dw.mdb_md_dbs_fire_orc_nopart |""".stripMargin).show() // text非分区表 sql( """ |insert into table tmp.mdb_md_dbs_fire_txt partition(ds) select * from tmp.mdb_md_dbs_fire where ds='20190620' limit 10 |""".stripMargin) }) } /** * 测试分区表更新hive元数据用例 */ def testSinglePartitionTable: Unit = { (1 to 3).foreach(_ => { sql( """ |insert into table tmp.mdb_md_dbs_fire_txt partition(ds) select * from tmp.mdb_md_dbs_fire where ds='20190619' limit 10 |""".stripMargin) }) // orc分区表 sql(s"""drop table if exists ${partitionTable}2""") sql( s""" |create table if not exists ${partitionTable}2 like ${partitionTable} |""".stripMargin) sql( s""" |insert into table ${partitionTable}2 partition(ds) select * from dw.mdb_md_dbs where ds='20211001' limit 100 |""".stripMargin) var partition = 20211002 (1 to 3).foreach(x => { sql(s"""alter table ${partitionTable}2 PARTITION (ds='20211001') RENAME TO PARTITION (ds='${partition}')""") partition = partition + 1 sql( s""" |insert into table ${partitionTable}2 partition(ds) select * from dw.mdb_md_dbs where ds='20211001' limit 100 |""".stripMargin) }) (1 to 3).foreach(x => { sql( s""" |insert into table ${partitionTable}2 partition(ds) select * from ${partitionTable}2 |""".stripMargin) }) } val jdbc = """ |use hive; |-- orc分区表 |select * from PARTITION_PARAMS where PART_ID in ( | select PART_ID from PARTITIONS p where TBL_ID = (SELECT TBL_ID FROM TBLS t where t.TBL_NAME = 'mdb_md_dbs_fire_orc2') |); | |SELECT * from TABLE_PARAMS t where t.TBL_ID = (SELECT TBL_ID FROM TBLS t where t.TBL_NAME = 'mdb_md_dbs_fire_orc') | |-- orc非分区表 |SELECT * from TABLE_PARAMS t where t.TBL_ID = (SELECT TBL_ID FROM TBLS t where t.TBL_NAME = 'mdb_md_dbs_fire_orc_nopart') | |SELECT * from TABLE_PARAMS t where t.TBL_ID = (SELECT TBL_ID FROM TBLS t where t.TBL_NAME = 'mdb_md_dbs_fire_txt') | | |-- 
textfile分区表 |select * from PARTITION_PARAMS where PART_ID = ( | select PART_ID from PARTITIONS p where TBL_ID = (SELECT TBL_ID FROM TBLS t where t.TBL_NAME = 'mdb_md_dbs_fire_txt') |); |""".stripMargin } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/hive/HiveRW.scala ================================================ package com.zto.fire.examples.spark.hive import com.zto.fire._ import com.zto.fire.common.util.JSONUtils import com.zto.fire.core.anno.connector.{Hive, Kafka} import com.zto.fire.examples.bean.Student import com.zto.fire.spark.SparkStreaming import org.apache.spark.sql.DataFrame /** * 基于Fire进行Spark Streaming开发 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Hive("test") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object HiveRW extends SparkStreaming { // 消息格式 // {"age":16,"className":"Student","createTime":"2020-08-03 17:23:05","id":6,"length":15.0,"name":"root","sex":true} // {"age":16,"className":"Student","createTime":"2020-08-03 17:23:05","id":6,"length":15.0,"name":"root","sex":true} override def process: Unit = { // this.ddl this.streaming // this.batch } /** * spark core模式 */ def batch: Unit = { val df = this.fire.createDataFrame(Student.newStudentList(), classOf[Student]) insert(df) } /** * streaming模式 */ def streaming: Unit = { val dstream = this.fire.createKafkaDirectStream() dstream.map(t => JSONUtils.parseObject[Student](t.value())).foreachRDD(rdd => { val df = this.fire.createDataFrame(rdd, classOf[Student]) insert(df) }) } /** * 创建表 */ def ddl: Unit = { sql( """ |drop table if exists tmp.baseorganize_fire |""".stripMargin) sql( """ |create table tmp.baseorganize_fire ( | id bigint, | name string, | age int |) partitioned by (ds string) |row format delimited fields terminated by '/t' |""".stripMargin) } /** * 动态分区写入 */ def insert(df: DataFrame): Unit = { sql("set hive.exec.dynamic.partition = true") sql("set hive.exec.dynamic.partition.mode=nonstrict") df.createOrReplaceTempView("t_student") sql( """ |insert into table tmp.baseorganize_fire |select | id, | name, | age, | '20220221' as ds |from t_student |""".stripMargin) sql( """ |select | *, | count(1) over() |from tmp.baseorganize_fire |""".stripMargin).show(3, false) } override def main(args: Array[String]): Unit = { this.init(10, false) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/jdbc/JdbcStreamingTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.spark.jdbc import com.zto.fire._ import com.zto.fire.core.anno.connector.Kafka import com.zto.fire.jdbc.JdbcConnector import com.zto.fire.spark.SparkStreaming import com.zto.fire.spark.anno.Streaming /** * @contact Fire框架技术交流群(钉钉):35373471 */ @Streaming(10) @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object JdbcStreamingTest extends SparkStreaming { val tableName = "spark_test" /** * Streaming的处理过程强烈建议放到process中,保持风格统一 * 注:此方法会被自动调用,在以下两种情况下,必须将逻辑写在process中 * 1. 开启checkpoint * 2. 支持streaming热重启(可在不关闭streaming任务的前提下修改batch时间) */ override def process: Unit = { val dstream = this.fire.createKafkaDirectStream() dstream.repartition(5).foreachRDD(rdd => { rdd.foreachPartition(it => { val sql = s"select id from $tableName limit 1" val retVal = JdbcConnector.executeQuery(sql, callback = _ => 1) logInfo("查询结果:" + retVal) }) }) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/jdbc/JdbcTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.jdbc import com.zto.fire._ import com.zto.fire.common.util.{DateFormatUtils, JSONUtils} import com.zto.fire.core.anno.connector.{Jdbc, Jdbc2} import com.zto.fire.examples.bean.Student import com.zto.fire.jdbc.JdbcConnector import com.zto.fire.spark.SparkCore import com.zto.fire.spark.util.SparkUtils import org.apache.spark.sql.SaveMode /** * Spark jdbc操作 * * @author ChengLong 2019-6-17 15:17:38 * @contact Fire框架技术交流群(钉钉):35373471 */ @Jdbc(url = "jdbc:mysql://mysql-server:3306/fire", username = "root", password = "root") @Jdbc2(url = "jdbc:mysql://mysql-server:3306/fire", username = "root", password = "root") object JdbcTest extends SparkCore { lazy val tableName = "spark_test" lazy val tableName2 = "t_cluster_info" lazy val tableName3 = "t_cluster_status" /** * 使用jdbc方式对关系型数据库进行增删改操作 */ def testJdbcUpdate: Unit = { val timestamp = DateFormatUtils.formatCurrentDateTime() // 执行insert操作 val insertSql = s"INSERT INTO $tableName (name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" this.fire.jdbcUpdate(insertSql, Seq("admin", 12, timestamp, 10.0, 1)) // 更新配置文件中指定的第二个关系型数据库 this.fire.jdbcUpdate(insertSql, Seq("admin", 12, timestamp, 10.0, 1), keyNum = 2) // 执行更新操作 val updateSql = s"UPDATE $tableName SET name=? WHERE id=?" 
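// the Seq binds positionally to the ? placeholders: name = "root", id = 1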
this.fire.jdbcUpdate(updateSql, Seq("root", 1)) // 执行批量操作 val batchSql = s"INSERT INTO $tableName (name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" this.fire.jdbcBatchUpdate(batchSql, Seq(Seq("spark1", 21, timestamp, 100.123, 1), Seq("flink2", 22, timestamp, 12.236, 0), Seq("flink3", 22, timestamp, 12.236, 0), Seq("flink4", 22, timestamp, 12.236, 0), Seq("flink5", 27, timestamp, 17.236, 0))) // 执行批量更新 this.fire.jdbcBatchUpdate(s"update $tableName set sex=? where id=?", Seq(Seq(1, 1), Seq(2, 2), Seq(3, 3), Seq(4, 4), Seq(5, 5), Seq(6, 6))) // 方式一:通过this.fire方式执行delete操作 val sql = s"DELETE FROM $tableName WHERE id=?" this.fire.jdbcUpdate(sql, Seq(2)) // 方式二:通过JdbcConnector.executeUpdate // 同一个事务 /*val connection = JdbcConnector.getConnection(keyNum = 2) this.fire.jdbcBatchUpdate("insert", connection = connection, commit = false, closeConnection = false) this.fire.jdbcBatchUpdate("delete", connection = connection, commit = false, closeConnection = false) this.fire.jdbcBatchUpdate("update", connection = connection, commit = true, closeConnection = true)*/ } /** * 使用jdbc方式对关系型数据库进行查询操作 */ def testJdbcQuery: Unit = { val sql = s"select * from $tableName where id in (?, ?, ?)" // 执行sql查询,并对查询结果集进行处理 this.fire.jdbcQuery(sql, Seq(1, 2, 3), callback = rs => { while (rs.next()) { // 对每条记录进行处理 println("driver=> id=" + rs.getLong(1)) } 1 }) // 将查询结果集以List[JavaBean]方式返回 val list = this.fire.jdbcQueryList(sql, Seq(1, 2, 3), classOf[Student]) // 方式二:使用JdbcConnector list.foreach(x => println(JSONUtils.toJSONString(x))) // 将结果集封装到RDD中 val rdd = this.fire.jdbcQueryRDD(sql, Seq(1, 2, 3)) rdd.printEachPartition // 将结果集封装到DataFrame中 val df = this.fire.jdbcQueryDF(sql, Seq(1, 2, 3)) df.show(10, false) } /** * 使用spark方式对表进行数据加载操作 */ def testTableLoad: Unit = { // 一次加载整张的jdbc小表,注:大表严重不建议使用该方法 this.fire.jdbcTableLoadAll(this.tableName).show(100, false) // 根据指定分区字段的上下边界分布式加载数据 this.fire.jdbcTableLoadBound(this.tableName, "id", 1, 10, 2).show(100, false) val where = Array[String]("id >=1 and id <=3", "id >=6 and id <=9", "name='root'") // 根据指定的条件进行数据加载,条件的个数决定了load数据的并发度 this.fire.jdbcTableLoad(tableName, where).show(100, false) } /** * 使用spark方式批量写入DataFrame数据到关系型数据库 */ def testTableSave: Unit = { // 批量将DataFrame数据写入到对应结构的关系型表中 val df = this.fire.createDataFrame(Student.newStudentList(), classOf[Student]) // 第二个参数默认为SaveMode.Append,可以指定SaveMode.Overwrite df.jdbcTableSave(this.tableName, SaveMode.Overwrite) // 利用sparkSession方式将DataFrame数据保存到配置的第二个数据源中 this.fire.jdbcTableSave(df, this.tableName, SaveMode.Overwrite, keyNum = 2) } /** * 将DataFrame数据写入到关系型数据库中 */ def testDataFrameSave: Unit = { val df = this.fire.createDataFrame(Student.newStudentList(), classOf[Student]) val insertSql = s"INSERT INTO spark_test(name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" // 指定部分DataFrame列名作为参数,顺序要对应sql中问号占位符的顺序,batch用于指定批次大小,默认取spark.db.jdbc.batch.size配置的值 df.jdbcBatchUpdate(insertSql, Seq("name", "age", "createTime", "length", "sex"), batch = 100) this.fire.jdbcTableLoadAll(this.tableName).show(100, false) df.createOrReplaceTempViewCache("student") val sqlDF = sql("select name, age, createTime from student where id>=1").repartition(1) // 若不指定字段,则默认传入当前DataFrame所有列,且列的顺序与sql中问号占位符顺序一致 sqlDF.jdbcBatchUpdate("insert into spark_test(name, age, createTime) values(?, ?, ?)", keyNum = 2) this.fire.jdbcTableLoadAll(this.tableName, keyNum = 2).show(100, false) // 等同以上方式 // this.fire.jdbcBatchUpdateDF(sqlDF, "insert into spark_test(name, age, createTime) values(?, ?, ?)") } /** * 在executor中执行jdbc操作 */ def testExecutor: 
Unit = { JdbcConnector.executeQuery(s"select id from $tableName limit 1", null, callback = _ => { Thread.sleep(1000) }) JdbcConnector.executeQuery(s"select id from $tableName limit 1", null, callback = _ => { }, keyNum = 2) this.logger.info("driver sql执行成功") val rdd = this.fire.createRDD(1 to 3, 3) rdd.foreachPartition(it => { it.foreach(i => { JdbcConnector.executeQuery(s"select id from $tableName limit 1", null, callback = _ => { }) }) this.logger.info("sql执行成功") }) this.logConf val rdd2 = this.fire.createRDD(1 to 3, 3) rdd2.foreachPartition(it => { it.foreach(i => { JdbcConnector.executeQuery(s"select id from $tableName limit 1", null, callback = _ => { this.logConf 1 }, keyNum = 2) this.logger.info("sql执行成功") }) }) } /** * 用于测试分布式配置 */ def logConf: Unit = { println(s"executorId=${SparkUtils.getExecutorId} hello.world=" + this.conf.getString("hello.world", "not_found")) println(s"executorId=${SparkUtils.getExecutorId} hello.world.flag=" + this.conf.getBoolean("hello.world.flag", false)) println(s"executorId=${SparkUtils.getExecutorId} hello.world.flag2=" + this.conf.getBoolean("hello.world.flag", false, keyNum = 2)) } override def process: Unit = { // 测试环境测试 this.testJdbcUpdate this.testJdbcQuery // this.testJdbcUpdate /*this.testJdbcUpdate this.testJdbcQuery this.testTableLoad this.testTableSave this.testDataFrameSave*/ // 测试配置分发 this.testExecutor Thread.sleep(100000) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/lineage/DataSourceTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.spark.lineage import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.examples.bean.Student import com.zto.fire.spark.SparkCore import org.apache.spark.sql.SaveMode /** * Spark DataSource API示例 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Config( """ |# 一、hudi datasource,全部基于配置文件进行配置 |spark.datasource.format=org.apache.hudi |spark.datasource.saveMode=Append |# 用于区分调用save(path)还是saveAsTable |spark.datasource.isSaveTable=false |# 传入到底层save或saveAsTable方法中 |spark.datasource.saveParam=/user/hive/warehouse/hudi.db/hudi_bill_event_test | |# 以spark.datasource.options.为前缀的配置用于配置hudi相关的参数,可覆盖代码中同名的配置 |spark.datasource.options.hoodie.datasource.write.recordkey.field=id |spark.datasource.options.hoodie.datasource.write.precombine.field=id |spark.datasource.options.hoodie.datasource.write.partitionpath.field=ds |spark.datasource.options.hoodie.table.name=hudi.hudi_bill_event_test |spark.datasource.options.hoodie.datasource.write.hive_style_partitioning=true |spark.datasource.options.hoodie.datasource.write.table.type=MERGE_ON_READ |spark.datasource.options.hoodie.insert.shuffle.parallelism=128 |spark.datasource.options.hoodie.upsert.shuffle.parallelism=128 |spark.datasource.options.hoodie.fail.on.timeline.archiving=false |spark.datasource.options.hoodie.clustering.inline=true |spark.datasource.options.hoodie.clustering.inline.max.commits=8 |spark.datasource.options.hoodie.clustering.plan.strategy.target.file.max.bytes=1073741824 |spark.datasource.options.hoodie.clustering.plan.strategy.small.file.limit=629145600 |spark.datasource.options.hoodie.clustering.plan.strategy.daybased.lookback.partitions=2 | |# 二、配置第二个数据源,以数字后缀作为区分,部分使用配置文件进行配置 |spark.datasource.format2=org.apache.hudi2 |spark.datasource.saveMode2=Overwrite |# 用于区分调用save(path)还是saveAsTable |spark.datasource.isSaveTable2=false |# 传入到底层save或saveAsTable方法中 |spark.datasource.saveParam2=/user/hive/warehouse/hudi.db/hudi_bill_event_test2 | |# 三、配置第三个数据源,用于代码中进行read操作 |spark.datasource.format3=org.apache.hudi3 |spark.datasource.loadParam3=/user/hive/warehouse/hudi.db/hudi_bill_event_test3 |spark.datasource.options.hoodie.datasource.write.recordkey.field3=id3 |""") object DataSourceTest extends SparkCore { override def process: Unit = { val ds = this.fire.createDataFrame(Student.newStudentList(), classOf[Student]) ds.createOrReplaceTempView("test") val dataFrame = sql("select * from test") // 一、 dataFrame.write.format.mode.save中的所有参数均可通过配置文件指定 // dataFrame.writeEnhance() // 二、 dataFrame.write.mode.save中部分参数通过配置文件指定,或全部通过方法硬编码指定 val savePath = "/user/hive/warehouse/hudi.db/hudi_bill_event_test" // 如果代码中与配置文件中均指定了options,则相同的options配置文件优先级更高,不同的option均生效 val options = Map( "hoodie.datasource.write.recordkey.field" -> "id", "hoodie.datasource.write.precombine.field" -> "id" ) // 使用keyNum标识读取配置文件中不同配置后缀的options信息 // dataFrame.writeEnhance("org.apache.hudi", SaveMode.Append, savePath, options = options, keyNum = 2) // read.format.mode.load(path) this.fire.readEnhance(keyNum = 3) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/lineage/LineageTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.lineage import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.{DateFormatUtils, JSONUtils, ThreadUtils} import com.zto.fire.core.anno.connector._ import com.zto.fire.core.anno.lifecycle.{Process, Step1} import com.zto.fire.examples.bean.Student import com.zto.fire.hbase.HBaseConnector import com.zto.fire.spark.SparkStreaming import com.zto.fire.spark.anno.Streaming import com.zto.fire.spark.sync.SparkLineageAccumulatorManager import java.util.concurrent.TimeUnit /** * 基于Fire进行Spark Streaming开发 * * @contact Fire框架技术交流群(钉钉):35373471 */ @HBase("test") @Hive("test") @Config("""fire.lineage.run.initialDelay=10""") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") @Streaming(interval = 10, concurrent = 2, backpressure = true, maxRatePerPartition = 100) @RocketMQ(brokers = "bigdata_test", topics = "fire2", groupId = "fire") @Jdbc(url = "jdbc:mysql://mysql-server:3306/fire", username = "root", password = "root") object LineageTest extends SparkStreaming { private val hbaseTable = "fire_test_1" private lazy val tableName = "spark_test" val multiPartitionTable = "tmp.mdb_md_dbs_fire_multi_partition_orc" @Process def source: Unit = { this.fire.createKafkaDirectStream().print() sql( s""" |insert into table ${multiPartitionTable} partition(ds, city) select *,'sh' as city from dw.mdb_md_dbs where ds='20211001' limit 100 |""".stripMargin) val dstream = this.fire.createRocketMqPullStream().map(t => JSONUtils.toJSONString(t)) dstream.foreachRDD(rdd => { rdd.foreachPartition(it => { val timestamp = DateFormatUtils.formatCurrentDateTime() val insertSql = s"INSERT INTO $tableName (name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" this.fire.jdbcUpdate(insertSql, Seq("admin", 12, timestamp, 10.0, 1)) HBaseConnector.get[Student](hbaseTable, classOf[Student], Seq("1")) }) val studentList = Student.newStudentList() val studentDF = this.fire.createDataFrame(studentList, classOf[Student]) // 每个批次插100条 studentDF.hbasePutDF(this.hbaseTable, classOf[Student]) }) dstream.print() } @Step1("周期性执行") def test: Unit = { ThreadUtils.scheduleAtFixedRate({ println(s"累加器值:" + JSONUtils.toJSONString(SparkLineageAccumulatorManager.getValue)) }, 0, 60, TimeUnit.SECONDS) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/lineage/SparkCoreLineageTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.lineage import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.{DateFormatUtils, JSONUtils, ThreadUtils} import com.zto.fire.core.anno.connector.{HBase, Jdbc, Kafka, RocketMQ} import com.zto.fire.examples.bean.Student import com.zto.fire.hbase.HBaseConnector import com.zto.fire.spark.SparkCore import com.zto.fire.spark.sync.SparkLineageAccumulatorManager import java.util.concurrent.TimeUnit /** * 基于Fire进行Spark Streaming开发 * * @contact Fire框架技术交流群(钉钉):35373471 */ @HBase("test") @Config( """ |fire.lineage.run.initialDelay=10 |fire.shutdown.auto.exit=false |""") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") @RocketMQ(brokers = "bigdata_test", topics = "fire2", groupId = "fire") @Jdbc(url = "jdbc:mysql://mysql-server:3306/fire", username = "root", password = "root") object SparkCoreLineageTest extends SparkCore { private val hbaseTable = "fire_test_1" private lazy val tableName = "spark_test" override def process: Unit = { (1 to 10).foreach(x => { val df = this.fire.createDataFrame(Student.newStudentList(), classOf[Student]) df.rdd.foreachPartition(it => { val timestamp = DateFormatUtils.formatCurrentDateTime() val insertSql = s"INSERT INTO $tableName (name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" this.fire.jdbcUpdate(insertSql, Seq("admin", 12, timestamp, 10.0, 1)) HBaseConnector.get[Student](hbaseTable, classOf[Student], Seq("1")) }) // 每个批次插100条 df.hbasePutDF(this.hbaseTable, classOf[Student]) Thread.sleep(10000) }) val df = this.fire.createDataFrame(Student.newStudentList(), classOf[Student]) df.rdd.foreachPartition(it => { val a = 1 / 0 }) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/module/ArthasTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.spark.module import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.JSONUtils import com.zto.fire.core.anno.connector.{Hive, Kafka} import com.zto.fire.examples.bean.Student import com.zto.fire.spark.SparkStreaming import com.zto.fire.spark.anno.Streaming /** * Spark Streaming集成Arthas工具测试 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Config( """ |# 直接从配置文件中拷贝过来即可 |fire.acc.timer.max.size=30 |fire.acc.log.max.size=20 |fire.analysis.arthas.enable=true |fire.log.level.conf.org.apache.spark=warn |fire.analysis.arthas.container.enable=true |fire.analysis.arthas.conf.arthas.username=spark |""") @Hive("test") @Streaming(20) @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object ArthasTest extends SparkStreaming { override def process: Unit = { val dstream = this.fire.createKafkaDirectStream() // 用于模拟性能问题 new Thread(new Runnable { override def run(): Unit = { while (true) { printConf } } }).start() // 至少一次的语义保证,处理成功自动提交offset,处理失败会重试指定次数,如果仍失败则任务退出 dstream.foreachRDDAtLeastOnce(rdd => { val studentRDD = rdd.map(t => { printConf JSONUtils.parseObject[Student](t.value()) }).repartition(2) val insertSql = s"INSERT INTO spark_test(name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" println("kafka.brokers.name=>" + this.conf.getString("kafka.brokers.name")) studentRDD.toDF().jdbcBatchUpdate(insertSql, Seq("name", "age", "createTime", "length", "sex"), batch = 100) })(reTry = 5, exitOnFailure = true) this.spark.sql( """ |SELECT | * |FROM rtdb.zto_ssmx_bill_detail |WHERE | order_create_date>= cast( date_add(current_date,-10) as timestamp ) | AND order_create_date< cast( date_add(current_date,1) as timestamp ) |""".stripMargin).show(100000, false) } def printConf: Unit = { Thread.sleep(10000) println("================================") println("fire.thread.pool.size=" + this.conf.getInt("fire.thread.pool.size", -1)) println("fire.thread.pool.schedule.size=" + this.conf.getInt("fire.thread.pool.schedule.size", -1)) println("fire.acc.timer.max.size=" + this.conf.getInt("fire.acc.timer.max.size", -1)) println("fire.acc.log.max.size=" + this.conf.getInt("fire.acc.log.max.size", -1)) println("fire.jdbc.query.partitions=" + this.conf.getInt("fire.jdbc.query.partitions", -1)) println("================================") } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/module/ExceptionTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.spark.module import com.zto.fire._ import com.zto.fire.core.anno.connector._ import com.zto.fire.spark.SparkCore import com.zto.fire.spark.anno.Streaming @Hive("test") @Streaming(interval = 10) @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") object ExceptionTest extends SparkCore { override def process: Unit = { // this.testSqlException this.testSqlException } /** * 测试SQL异常捕获 */ def testSqlException: Unit = { sql( """ |use dim; |select ,'sh' as city from dw.mdb_md_dbs where ds='20211001' limit 100; |""".stripMargin).print() } /** * 测试API的异常捕获 */ def testApiException: Unit = { val dstream = this.fire.createKafkaDirectStream() dstream.map(t => { val a = 1 / 0 t }).print() } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/schedule/ScheduleTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.schedule import com.zto.fire._ import com.zto.fire.common.anno.{Config, Scheduled} import com.zto.fire.common.util.DateFormatUtils import com.zto.fire.spark.SparkStreaming import com.zto.fire.spark.util.SparkUtils /** * 用于测试定时任务 * * @author ChengLong 2019年11月5日 17:27:20 * @since 0.3.5 * @contact Fire框架技术交流群(钉钉):35373471 */ @Config( """ |spark.fire.task.schedule.enable = true |# 定时任务黑名单,配置方法名,多个以逗号分隔,配置的方法将不再被定时任务定时拉起 |spark.fire.scheduler.blacklist = jvmMonitor,setConf2,registerAcc |""") object ScheduleTest extends SparkStreaming { /** * 声明了@Scheduled注解的方法是定时任务方法,会周期性执行 * * @cron cron表达式 * @scope 默认同时在driver端和executor端执行,如果指定了driver,则只在driver端定时执行 * @concurrent 上一个周期定时任务未执行完成时是否允许下一个周期任务开始执行 * @startAt 用于指定第一次开始执行的时间 * @initialDelay 延迟多长时间开始执行第一次定时任务 */ @Scheduled(cron = "0/5 * * * * ?", scope = "driver", concurrent = false, startAt = "2021-01-21 11:30:00", initialDelay = 60000) def loadTable: Unit = { this.logger.info("更新维表动作") } /** * 只在driver端执行,不允许同一时刻同时执行该方法 * startAt用于指定首次执行时间 */ @Scheduled(cron = "0/5 * * * * ?", scope = "all", concurrent = false) def test2: Unit = { this.logger.info("executorId=" + SparkUtils.getExecutorId + "====方法 test2() 每5秒执行====" + DateFormatUtils.formatCurrentDateTime()) } // 每天凌晨4点01将锁标志设置为false,这样下一个批次就可以先更新维表再执行sql @Scheduled(cron = "0 1 4 * * ?") def updateTableJob: Unit = this.lock.compareAndSet(true, false) // 用于缓存变更过的维表,只有当定时任务将标记设置为可更新时才会真正拉取最新的表 def cacheTable: Unit = { // 加载完成维表以后上锁 if (this.lock.compareAndSet(false, true)) { this.fire.uncache("test") this.fire.cacheTables("test") } } override def process: Unit = { // 用于注册其他类下带有@Scheduler标记的方法 this.registerSchedule(new Tasks) // 重复注册的任务会自动去重 this.registerSchedule(new Tasks) // 更新并缓存维表动作,具体要根据锁的标记判断是否执行 this.cacheTable } } ================================================ FILE: 
fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/schedule/Tasks.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.schedule import com.zto.fire.common.anno.Scheduled import com.zto.fire.common.util.DateFormatUtils import com.zto.fire.spark.util.SparkUtils /** * 定时任务注册类 * 1. 可序列化 * 2. 方法不带任何参数 * * @author ChengLong 2019年11月5日 17:29:35 * @since 0.3.5 * @contact Fire框架技术交流群(钉钉):35373471 */ class Tasks extends Serializable { /** * 在driver端与executor端执行(scope=all),不允许同一时刻并发执行该方法 */ @Scheduled(cron = "0/15 * * * * ?", scope = "all", concurrent = false) def test5: Unit = { println("executorId=" + SparkUtils.getExecutorId + "====方法 test5() 每15秒执行====" + DateFormatUtils.formatCurrentDateTime()) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/sql/LoadTestSQL.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
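// Hedged sketch, not a repository file: a second external task holder in the style of Tasks above.
// The class name DimTableRefresher is hypothetical; as with Tasks, the class is Serializable and the
// annotated method takes no parameters, and it would be registered from a job the same way
// ScheduleTest does with this.registerSchedule(new Tasks). The fixedInterval/scope parameters are the
// ones used by KafkaTest later in this module.
import com.zto.fire.common.anno.Scheduled
import com.zto.fire.common.util.DateFormatUtils

class DimTableRefresher extends Serializable {
  // runs once a minute on both driver and executors (scope = "all")
  @Scheduled(fixedInterval = 60 * 1000, scope = "all")
  def refresh: Unit = {
    println("refresh dim tables at " + DateFormatUtils.formatCurrentDateTime())
  }
}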
*/ package com.zto.fire.examples.spark.sql /** * 用于集群压测程序的SQL * @author ChengLong 2019年10月25日 13:32:19 * @contact Fire框架技术交流群(钉钉):35373471 */ object LoadTestSQL { def jsonParseSQL: String = { """ |select | after.CAR_SIGN_CODE, | nvl(after.CAR_SIGN_CODE_OLD,'') CAR_SIGN_CODE_OLD, | after.CAR_DATE, | after.SCAN_SITE_ID, | after.PRE_OR_NEX_STA_ID, | after.PRE_OR_NEXT_STATION, | from_unixtime( | unix_timestamp(after.CAR_DATE,'yyyy-MM-dd HH:mm:ss'), | 'yyyy-MM-dd HH:mm:ss' | ) ARRIVE_DATE |from test t1 |left join dim_c2c_cost t2 |on after.SCAN_SITE_ID=t2.start_site_id |and after.PRE_OR_NEX_STA_ID=t2.end_site_id |and substr(after.CAR_DATE,12,2)=t2.hh24 |""".stripMargin } def loadSQL: String = { """ |select | f.site_name as site_name, | f.site_id as site_id, | f.collect_date as collect_date, | sum(f.rec_count) as rec_count, | sum(f.rec_weight) as rec_weight, | sum(f.send_count) as send_count, | sum(f.send_weight) as send_weight, | sum(f.send_bag_count) as send_bag_count, | sum(f.send_bag_bill_count) as send_bag_bill_count, | sum(f.send_bag_weight) as send_bag_weight, | sum(f.come_count) as come_count, | sum(f.come_weight) as come_weight, | sum(f.come_bag_count) as come_bag_count, | sum(f.come_bag_weight) as come_bag_weight, | sum(f.come_bag_bill_count) as come_bag_bill_count, | sum(f.disp_count) as disp_count, | sum(f.disp_weight) as disp_weight, | sum(f.sign_count) as sign_count, | f.ds as ds | from ( | select f.scan_site as site_name, | f.scan_site_id as site_id, | f.scan_date as collect_date, | count(f.bill_code) as rec_count, | sum(nvl(f.weight,0)) as rec_weight, | 0 as send_count, | 0 as send_weight, | 0 as send_bag_count, | 0 as send_bag_bill_count, | 0 as send_bag_weight, | 0 as come_count, | 0 as come_weight, | 0 as come_bag_count, | 0 as come_bag_bill_count, | 0 as come_bag_weight, | 0 as disp_count, | 0 as disp_weight, | 0 as sign_count, | f.ds as ds | from (select a.scan_site, | a.scan_site_id, | to_date(a.scan_date) as scan_date, | last_value(a.bill_code) over (partition by a.scan_site_id,a.bill_code order by a.scan_date desc) as bill_code, | max(a.weight) over (partition by a.scan_site_id,a.bill_code) as weight, | a.ds as ds | from dw.dw_zt_zto_scan_rec a | where a.ds>='20191020' | and a.ds<'20191021' | and a.scan_site_id>0 | ) f group by f.scan_site,f.scan_site_id,f.scan_date,f.ds | | union all | select f.scan_site as site_name,f.scan_site_id as site_id,f.scan_date as collect_date, | 0 as rec_count, | 0 as rec_weight, | count(f.bill_code) as send_count, | sum(f.weight) as send_weight, | 0 as send_bag_count, | 0 as send_bag_bill_count, | 0 as send_bag_weight, | 0 as come_count, | 0 as come_weight, | 0 as come_bag_count, | 0 as come_bag_bill_count, | 0 as come_bag_weight, | 0 as disp_count, | 0 as disp_weight, | 0 as sign_count, | f.ds as ds | from (select a.scan_site, | a.scan_site_id, | to_date(a.scan_date) as scan_date, | last_value(a.bill_code) over (partition by a.scan_site_id,a.bill_code order by a.scan_date desc) as bill_code, | max(a.weight) over (partition by a.scan_site_id,a.bill_code) as weight, | a.ds as ds | from dw.dw_zt_zto_scan_send a | where a.ds>='20191020' | and a.ds<'20191021' | and a.scan_site_id>0 | ) f group by f.scan_site,f.scan_site_id,f.scan_date,f.ds | union all | select f.scan_site as site_name,f.scan_site_id as site_id,f.scan_date as collect_date, | 0 as rec_count, | 0 as rec_weight, | 0 as send_count, | 0 as send_weight, | count(f.bill_code) as send_bag_count, | sum(d.bagbillsum) as send_bag_bill_count, | sum(nvl(f.weight,0)) as send_bag_weight, | 0 as 
come_count, | 0 as come_weight, | 0 as come_bag_count, | 0 as come_bag_bill_count, | 0 as come_bag_weight, | 0 as disp_count, | 0 as disp_weight, | 0 as sign_count, | f.ds as ds | from (select a.scan_site, | a.scan_site_id, | to_date(a.scan_date) as scan_date , | LAST_VALUE(a.bill_code) over (partition by a.scan_site_id,a.bill_code order by a.scan_date desc) as bill_code, | max(a.weight) over (partition by a.scan_site_id,a.bill_code) as weight, | a.ds as ds | from dw.dw_zt_zto_scan_send_bag a | where a.ds>='20191020' | and a.ds<'20191021' | and a.scan_site_id>0 | ) f | left join (select sum(bagbillsum) as bagbillsum, bill_code as owner_bag_no from dw.zto_bagbillsum_weight where ds>='20190920' | and ds<='20191020' | group by bill_code | ) d on d.owner_bag_no=f.bill_code | group by f.scan_site,f.scan_site_id,f.scan_date,f.ds | union all | select f.scan_site as site_name,f.scan_site_id as site_id,f.scan_date as collect_date, | 0 as rec_count, | 0 as rec_weight, | 0 as send_count, | 0 as send_weight, | 0 as send_bag_count, | 0 as send_bag_bill_count, | 0 as send_bag_weight, | count(f.bill_code) as come_count, | sum(f.weight) as come_weight, | 0 as come_bag_count, | 0 as come_bag_bill_count, | 0 as come_bag_weight, | 0 as disp_count, | 0 as disp_weight, | 0 as sign_count, | f.ds as ds | from (select a.scan_site, | a.scan_site_id, | to_date(a.scan_date) as scan_date, | last_value(a.bill_code) over (partition by a.scan_site_id,a.bill_code order by a.scan_date desc) as bill_code, | max(a.weight) over (partition by a.scan_site_id,a.bill_code) as weight, | a.ds as ds | from dw.dw_zt_zto_scan_come a | where a.ds>='20191020' | and a.ds<'20191021' | and a.scan_site_id>0 | ) f group by f.scan_site,f.scan_site_id,f.scan_date,f.ds | union all | select f.scan_site as site_name,f.scan_site_id as site_id,f.scan_date as collect_date, | 0 as rec_count, | 0 as rec_weight, | 0 as send_count, | 0 as send_weight, | 0 as send_bag_count, | 0 as send_bag_bill_count, | 0 as send_bag_weight, | 0 as come_count, | 0 as come_weight, | count(f.bill_code) as come_bag_count, | sum(bagbillsum) as come_bag_bill_count, | sum(nvl(f.weight,0)) as come_bag_weight, | 0 as disp_count, | 0 as disp_weight, | 0 as sign_count, | f.ds as ds | from (select a.scan_site, | a.scan_site_id, | to_date(a.scan_date) as scan_date, | last_value(a.bill_code) over (partition by a.scan_site_id,a.bill_code order by a.scan_date desc) as bill_code, | max(a.weight) over (partition by a.scan_site_id,a.bill_code) as weight, | a.ds as ds | from dw.dw_zt_zto_scan_come_bag a | where a.ds>='20191020' | and a.ds<'20191021' | and a.scan_site_id>0 | ) f | left join ( | select sum(bagbillsum) as bagbillsum,bill_code as owner_bag_no from dw.zto_bagbillsum_weight where ds>='20190920' | and ds<='20191020' | group by bill_code |) d on d.owner_bag_no=f.bill_code | group by f.scan_site,f.scan_site_id,f.scan_date,f.ds | union all | select f.scan_site as site_name,f.scan_site_id as site_id,f.scan_date as collect_date, | 0 as rec_count, | 0 as rec_weight, | 0 as send_count, | 0 as send_weight, | 0 as send_bag_count, | 0 as send_bag_bill_count, | 0 as send_bag_weight, | 0 as come_count, | 0 as come_weight, | 0 as come_bag_count, | 0 as come_bag_bill_count, | 0 as come_bag_weight, | count(f.bill_code) as disp_count, | sum(nvl(f.weight,0)) as disp_weight, | 0 as sign_count, | f.ds as ds | from (select a.scan_site, | a.scan_site_id, | to_date(a.scan_date) as scan_date, | last_value(a.bill_code) over (partition by a.scan_site_id,a.bill_code order by a.scan_date desc) as 
bill_code, | max(a.weight) over (partition by a.scan_site_id,a.bill_code) as weight, | a.ds as ds | from dw.dw_zt_zto_scan_disp a | where a.ds>='20191020' | and a.ds<'20191021' | and a.scan_site_id>0 | ) f group by f.scan_site,f.scan_site_id,f.scan_date,f.ds | union all | select | r.record_site as site_name, | r.record_site_id as site_id, | to_date(record_date) as collect_date, | 0 as rec_count, | 0 as rec_weight, | 0 as send_count, | 0 as send_weight, | 0 as send_bag_count, | 0 as send_bag_bill_count, | 0 as send_bag_weight, | 0 as come_count, | 0 as come_weight, | 0 as come_bag_count, | 0 as come_bag_bill_count, | 0 as come_bag_weight, | 0 as disp_count, | 0 as disp_weight, | count(r.bill_code) as sign_count, | r.ds as ds | from dw.dw_zt_zto_sign r | where r.ds>='20191020' | and r.ds<='20191021' | and r.record_site_id>0 | group by r.record_site,r.record_site_id,to_date(record_date),r.ds | ) f | group by f.site_name , | f.site_id , | f.collect_date, | f.ds |DISTRIBUTE BY rand() |""".stripMargin } def cacheDim: String = { """ |select cast(start_site_id as string), | cast(end_site_id as string), | substr(concat('0',cast(actual_start_date_hour as string)),-2,2) as hh24, | cast(c2c_hour_percent50_onway_hour as string) as cost_time |from ba.zy_tmp_center_onway_hour_configure """.stripMargin } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/sql/SparkSqlParseTest.scala ================================================ package com.zto.fire.examples.spark.sql import com.zto.fire.common.bean.TableIdentifier import com.zto.fire.common.util.{JSONUtils, ThreadUtils} import com.zto.fire.core.anno.connector.Hive import com.zto.fire.examples.bean.Student import com.zto.fire.spark.SparkCore import com.zto.fire.spark.sql.SparkSqlParser import com.zto.fire.spark.sync.SparkLineageAccumulatorManager import java.util.concurrent.TimeUnit /** * Spark SQL血缘解析工具 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Hive("test") object SparkSqlParseTest extends SparkCore { override def process: Unit = { val ds = this.spark.createDataFrame(Student.newStudentList(), classOf[Student]) ds.createOrReplaceTempView("t_student") println("t_student -> " + SparkSqlParser.isHiveTable(TableIdentifier("t_student"))) println("tmp.baseuser ->" + SparkSqlParser.isHiveTable(TableIdentifier("tmp.baseuser"))) val select1 = """ |select count(*) |from (select * from st.st_fwzl_transfer_kpi_detail_month) a |left join (select biz_no,bill_code from dw.dw_kf_center_to_center_dispatch_delay where ds>='20210101') b |on a.bill_code=b.bill_code |""".stripMargin val select2 = """ |select bill_event_id,count(*) from hudi.hudi_bill_item group by bill_event_id |""".stripMargin val insertInto = """ |insert into ods.base select a,v from tmp.t_user t1 left join ods.test t2 on t1.id=t2.id |""".stripMargin val alterTableAddPartitionStatement = """ |alter table tmp.t_user add if not exists partition (ds='20210620', city = 'beijing') |""".stripMargin val dropTable = """ |drop table if exists tmp.test |""".stripMargin val renameTable = """ |alter table tmp.t_user rename to ods.t_user2 |""".stripMargin val dropPartition = """ |ALTER TABLE tmp.food DROP IF EXISTS PARTITION (ds='20151219', city = 'beijing') |""".stripMargin val renamePartition = """ |Alter table tmp.test partition (ds='201801', city='beijing') rename to partition(ds='202106', city='shanghai') |""".stripMargin val createTable = """ |CREATE TABLE `tmp.test`( | `dept_no` int, | `addr` string, | `tel` string) |partitioned by(ds 
string, city string) |ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' |""".stripMargin val createTableAsSelect = """ |create table if not exists tmp.zto_fire_test |select a.*,'sh' as city |from dw.mdb_md_dbs a left join student t on a.ds=t.name |where ds='20211001' limit 100 |""".stripMargin val dropDB = "drop database if exists tmp12" val insertOverwrite = "insert overwrite table dw.kwang_test partition(ds='202106', city='beijing') values(4,'zz')" val insertIntoAsSelect = """ |insert into zto_cockpit_site_target_ds |SELECT site_id,scan_date,scan_day, |SUM(a.rec_cnt) rec_cnt, |SUM(a.order_cnt) order_cnt, |SUM(a.disp_cnt) disp_cnt, |SUM(a.sign_cnt) sign_cnt, |SUM(a.ele_cnt) ele_cnt, |SUM(a.bag_cnt) bag_cnt |FROM ( |SELECT t1.site_id,t1.scan_date,t1.scan_day , |t1.cnt rec_cnt, |0 order_cnt, |0 disp_cnt, |0 sign_cnt, |t1.ele_cnt ele_cnt, |t1.bag_cnt bag_cnt |FROM ztkb.zto_cockpit_site_rec_ds t1 |WHERE t1.scan_day = '#date#' |UNION ALL |SELECT t2.site_id,t2.order_date scan_date,t2.order_day scan_day , |0 rec_cnt, |t2.cnt order_cnt, |0 disp_cnt, |0 sign_cnt, |0 ele_cnt, |0 bag_cnt |FROM ztkb.zto_cockpit_site_order_ds t2 |WHERE t2.order_day = '#date#' |UNION ALL |SELECT t3.site_id,t3.scan_date,t3.scan_day , |0 rec_cnt, |0 order_cnt, |t3.cnt disp_cnt, |0 sign_cnt, |0 ele_cnt, |0 bag_cnt |FROM ztkb.zto_cockpit_site_disp_ds t3 |WHERE t3.scan_day = '#date#' |UNION ALL |select t.record_site_id site_id,t.sign_date scan_date,t.sign_day scan_day, |0 rec_cnt, |0 order_cnt, |0 disp_cnt, |sum(t.cnt) sign_cnt, |0 ele_cnt, |0 bag_cnt |from ztkb.zto_cockpit_site_sign_ds t |where t.sign_day = '#date#' |group by t.record_site_id,t.sign_date,t.sign_day |) a |GROUP BY site_id,scan_date,scan_day """.stripMargin SparkSqlParser.sqlParser(select1) ThreadUtils.scheduleAtFixedRate({ println(s"累加器值:" + JSONUtils.toJSONString(SparkLineageAccumulatorManager.getValue) + "\n\n") }, 0, 10, TimeUnit.SECONDS) Thread.currentThread().join() } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/streaming/AtLeastOnceTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.spark.streaming import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.JSONUtils import com.zto.fire.core.anno.connector.{Hive, Kafka} import com.zto.fire.examples.bean.Student import com.zto.fire.spark.SparkStreaming import com.zto.fire.spark.anno.Streaming /** * 基于Fire进行Spark Streaming开发 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Config( """ |# 直接从配置文件中拷贝过来即可 |fire.acc.timer.max.size=30 |fire.acc.log.max.size=20 |fire.shutdown.auto.exit=true |""") @Hive("test") @Streaming(20) // spark streaming的批次时间 @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object AtLeastOnceTest extends SparkStreaming { override def process: Unit = { val dstream = this.fire.createKafkaDirectStream() // 至少一次的语义保证,处理成功自动提交offset,处理失败会重试指定次数,如果仍失败则任务退出 dstream.foreachRDDAtLeastOnce(rdd => { val studentRDD = rdd.map(t => JSONUtils.parseObject[Student](t.value())).repartition(2) val insertSql = s"INSERT INTO spark_test(name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" println("kafka.brokers.name=>" + this.conf.getString("kafka.brokers.name")) studentRDD.toDF().jdbcBatchUpdate(insertSql, Seq("name", "age", "createTime", "length", "sex"), batch = 1) }) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/streaming/ConfigCenterTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
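// Hedged sketch, not a repository file: AtLeastOnceTest above relies on the default retry behaviour of
// foreachRDDAtLeastOnce. The second parameter list shown below (reTry, exitOnFailure) is the one that
// RocketTest and ArthasTest in this module pass explicitly; the values 3/false are illustrative only.
// This is a fragment meant to stand in for the call inside AtLeastOnceTest.process (it reuses that
// object's dstream, JSONUtils and Student imports), not standalone code.
dstream.foreachRDDAtLeastOnce(rdd => {
  val studentRDD = rdd.map(t => JSONUtils.parseObject[Student](t.value())).repartition(2)
  val insertSql = s"INSERT INTO spark_test(name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)"
  studentRDD.toDF().jdbcBatchUpdate(insertSql, Seq("name", "age", "createTime", "length", "sex"), batch = 100)
})(reTry = 3, exitOnFailure = false) // retry each failed batch 3 times; exitOnFailure=false presumably keeps the job running after the final failure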
*/ package com.zto.fire.examples.spark.streaming import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.common.util.JSONUtils import com.zto.fire.core.anno.connector.Kafka import com.zto.fire.examples.bean.Student import com.zto.fire.spark.SparkStreaming import com.zto.fire.spark.anno.Streaming /** * 基于Fire进行Spark Streaming开发 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Config( """ |fire.acc.timer.max.size=30 |fire.acc.log.max.size=20 |fire.conf.test=java |""") @Streaming(20) // spark streaming的批次时间 @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object ConfigCenterTest extends SparkStreaming { /** * 业务逻辑代码,会被fire自动调用 */ override def process: Unit = { val dstream = this.fire.createKafkaDirectStream() this.printConf dstream.foreachRDD(rdd => { rdd.map(t => { printConf JSONUtils.parseObject[Student](t.value()) }).repartition(2).count() }) } /** * 配置信息打印 * * ================================ * fire.thread.pool.size=6 * fire.thread.pool.schedule.size=5 * fire.acc.timer.max.size=30 * fire.acc.log.max.size=22 * fire.jdbc.query.partitions=13 * fire.conf.test=flink * ================================ */ def printConf: Unit = { println("================================") println("fire.thread.pool.size=" + this.conf.getInt("fire.thread.pool.size", -1)) println("fire.thread.pool.schedule.size=" + this.conf.getInt("fire.thread.pool.schedule.size", -1)) println("fire.acc.timer.max.size=" + this.conf.getInt("fire.acc.timer.max.size", -1)) println("fire.acc.log.max.size=" + this.conf.getInt("fire.acc.log.max.size", -1)) println("fire.jdbc.query.partitions=" + this.conf.getInt("fire.jdbc.query.partitions", -1)) println("fire.conf.test=" + this.conf.getString("fire.conf.test")) println("================================") } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/streaming/DataGenTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.spark.streaming import com.zto.fire._ import com.zto.fire.examples.bean.Student import com.zto.fire.spark.SparkStreaming import com.zto.fire.spark.anno.Streaming /** * 基于DataGenReceiver来随机生成测试数据集 * * @author ChengLong 2022-03-07 15:35:55 * @since 2.2.1 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Streaming(20) object DataGenTest extends SparkStreaming { override def process: Unit = { // 方式一、在JavaBean中实现generate方法,在该方法中定义对象生成的规则 val dstream = this.fire.createBeanGenStream[Student](10) dstream.print(1) // 方式二、通过实现generateFun函数来定义数据生成规则 val dstream2 = this.fire.createDataGenStream(10, generateFun = Student.newStudentList()) dstream2.print(1) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/streaming/KafkaTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.streaming import com.zto.fire._ import com.zto.fire.common.anno.Scheduled import com.zto.fire.common.util.DateFormatUtils import com.zto.fire.core.anno.connector.{Kafka, Kafka2, Kafka3} import com.zto.fire.spark.SparkStreaming import com.zto.fire.spark.anno.Streaming /** * kafka json解析 * * @author ChengLong 2019-6-26 16:52:58 * @contact Fire框架技术交流群(钉钉):35373471 */ @Streaming(interval = 10) @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") @Kafka2(brokers = "bigdata_test", topics = "fire2", groupId = "fire") @Kafka3(brokers = "bigdata_test", topics = "fire3", groupId = "fire") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object KafkaTest extends SparkStreaming { // 每天凌晨4点01将锁标志设置为false,这样下一个批次就可以先更新维表再执行sql @Scheduled(cron = "0 1 4 * * ?") def updateTableJob: Unit = this.lock.compareAndSet(true, false) // 用于缓存变更过的维表,只有当定时任务将标记设置为可更新时才会真正拉取最新的表 def cacheTable: Unit = { // 加载完成维表以后上锁 if (this.lock.compareAndSet(false, true)) { // cache维表逻辑 } } override def process: Unit = { val dstream = this.fire.createKafkaDirectStream() // 使用至少一次的算子语义,支持在rdd处理失败时自动重试,并且在处理成功后会主动提交offset dstream.foreachRDDAtLeastOnce(rdd => { // 更新并缓存维表动作,具体要根据锁的标记判断是否执行 this.cacheTable // 一、将json解析并注册为临时表,默认不cache临时表 rdd.kafkaJson2Table("test", cacheTable = true) // toLowerDF表示将大写的字段转为小写 sql("select * from test").toLowerDF.show(1, false) /*sql("select after.* from test").toLowerDF.show(1, false) sql("select after.* from test where after.order_type=1").toLowerDF.show(1, false)*/ // 二、直接将json按指定的schema解析(只解析after),fieldNameUpper=true表示按大写方式解析,并自动转为小写 // rdd.kafkaJson2DF(classOf[OrderCommon], fieldNameUpper = true).show(1, false) // 递归解析所有指定的字段,包括before、table、offset等字段 // rdd.kafkaJson2DF(classOf[OrderCommon], parseAll = true, fieldNameUpper = true, isMySQL = false).show(1, false) this.fire.uncache("test") }) 
val dstream2 = this.fire.createKafkaDirectStream(keyNum = 2) dstream2.print(1) val dstream3 = this.fire.createKafkaDirectStream(keyNum = 3) dstream3.count().foreachRDD(rdd => { println("count=" + rdd.count()) }) dstream3.print(1) } @Scheduled(fixedInterval = 60 * 1000, scope = "all") def loadTable: Unit = { println(s"${DateFormatUtils.formatCurrentDateTime()}=================== 每分钟执行loadTable ===================") this.conf.settings.foreach(conf => println(conf._1 + " -> " + conf._2)) } @Scheduled(cron = "0 0 * * * ?") def loadTable2: Unit = { println(s"${DateFormatUtils.formatCurrentDateTime()}=================== 每小时执行loadTable2 ===================") } @Scheduled(cron = "0 0 9 * * ?") def loadTable3: Unit = { println(s"${DateFormatUtils.formatCurrentDateTime()}=================== 每天9点执行loadTable3 ===================") } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/streaming/RocketTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.streaming import com.zto.fire._ import com.zto.fire.common.util.JSONUtils import com.zto.fire.core.anno.connector.{RocketMQ, RocketMQ2} import com.zto.fire.examples.bean.Student import com.zto.fire.spark.SparkStreaming import com.zto.fire.spark.anno.Streaming /** * 消费rocketmq中的数据 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Streaming(10) @RocketMQ(brokers = "bigdata_test", topics = "fire", groupId = "fire", tag = "*") @RocketMQ2(brokers = "bigdata_test", topics = "fire2", groupId = "fire2", tag = "*", startingOffset = "latest") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object RocketTest extends SparkStreaming { override def process: Unit = { // 读取RocketMQ消息流 val dStream = this.fire.createRocketMqPullStream() this.fire.createRocketMqPullStream(keyNum = 2).print() dStream.foreachRDDAtLeastOnce(rdd => { val studentRDD = rdd.map(message => new String(message.getBody)).map(t => JSONUtils.parseObject[Student](t)).repartition(2) val insertSql = s"INSERT INTO spark_test2(name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" println("rocket.brokers.name=>" + this.conf.getString("rocket.brokers.name")) studentRDD.toDF().jdbcBatchUpdate(insertSql, Seq("name", "age", "createTime", "length", "sex"), batch = 100) })(reTry = 5, exitOnFailure = true) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/structured/JdbcSinkTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.structured import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.core.anno.connector.{HBase, Hive, Kafka} import com.zto.fire.examples.bean.Student import com.zto.fire.spark.BaseStructuredStreaming /** * 结构化流测试 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Hive("test") @HBase("test") @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object JdbcSinkTest extends BaseStructuredStreaming { override def process: Unit = { // 接入kafka并解析json,支持大小写,默认表名为kafka val kafkaDataset = this.fire.loadKafkaParseJson() // 直接使用或sql /*kafkaDataset.print() sql("select * from kafka").print()*/ // jdbc的sql语句 val insertSql = "insert into spark_test(name, age, createTime, length, sex, rowKey) values(?,?,?,?,?,?)" // 将流数据持续写入到关系型数据库中(插入部分列) kafkaDataset.select("data.name", "data.age", "data.createTime", "data.length", "data.sex", "data.rowKey").jdbcBatchUpdate(insertSql, keyNum = 6) // 插入所有列并在Seq中列举DataFrame指定顺序,该顺序必须与insertSql中的问号占位符存在绑定关系 kafkaDataset.select("data.*").jdbcBatchUpdate(insertSql, Seq("name", "age", "createTime", "length", "sex", "rowKey"), keyNum = 6) this.fire.createDataFrame(Student.newStudentList(), classOf[Student]).createOrReplaceTempViewCache("student") sql( """ |select | t.name, | s.length |from kafka t left join student s | on t.name=s.name |""".stripMargin).print() } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/structured/MapTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
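// Hedged sketch, not a repository file: JdbcSinkTest above writes with jdbcBatchUpdate(..., keyNum = 6),
// where the datasource itself comes from configuration. Assuming keyNum pairs with the numbered
// connector annotations in the same way createKafkaDirectStream(keyNum = 2) pairs with @Kafka2 and
// HBaseConnector(2) with @HBase2 elsewhere in these examples, a job writing to two JDBC targets might
// look as follows (object name TwoJdbcSinkSketch and both URLs are illustrative only):
import com.zto.fire._
import com.zto.fire.core.anno.connector.{Jdbc, Jdbc2}
import com.zto.fire.examples.bean.Student
import com.zto.fire.spark.SparkCore

@Jdbc(url = "jdbc:mysql://mysql-server:3306/fire", username = "root", password = "root")
@Jdbc2(url = "jdbc:mysql://mysql-server:3306/fire2", username = "root", password = "root")
object TwoJdbcSinkSketch extends SparkCore {
  override def process: Unit = {
    val df = this.fire.createDataFrame(Student.newStudentList(), classOf[Student])
    val insertSql = "INSERT INTO spark_test(name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)"
    df.jdbcBatchUpdate(insertSql, Seq("name", "age", "createTime", "length", "sex"))              // default datasource (@Jdbc)
    df.jdbcBatchUpdate(insertSql, Seq("name", "age", "createTime", "length", "sex"), keyNum = 2)  // second datasource (@Jdbc2)
  }
}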
*/ package com.zto.fire.examples.spark.structured import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.core.anno.connector.{Hive, Kafka} import com.zto.fire.examples.bean.Student import com.zto.fire.spark.BaseStructuredStreaming import org.apache.spark.sql.Encoders import com.zto.fire.spark.util.SparkUtils /** * 对结构化流执行map、mapPartition操作 * * @author ChengLong 2020年1月3日 18:00:59 * @contact Fire框架技术交流群(钉钉):35373471 */ @Hive("batch") @Kafka(brokers = "test", topics = "fire", groupId = "fire") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object MapTest extends BaseStructuredStreaming { override def process: Unit = { this.fire.loadKafkaParseJson() // 将字段转为与JavaBean对应的类型 val sqlDF = sql("select cast(age as int), createTime, cast(length as decimal), name, rowKey, cast(sex as boolean) from kafka") // 执行map操作 sqlDF.map(row => { // 执行任意的操作 println("=========hello===========") // 将row转为JavaBean SparkUtils.sparkRowToBean(row, classOf[Student]) // 指定Encoders,必须是具有schema的目标类型,map后的类型即为Encoders中要指定的类型。不支持对普通数值类型的map,必须是DateType的子类 })(Encoders.bean(classOf[Student])).print() // mapPartition操作 sqlDF.mapPartitions(it => SparkUtils.sparkRowToBean(it, classOf[Student]))(Encoders.bean(classOf[Student])).print() } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/structured/StructuredStreamingTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.structured import com.zto.fire._ import com.zto.fire.common.anno.Config import com.zto.fire.core.anno.connector.{Hive, Kafka} import com.zto.fire.spark.BaseStructuredStreaming /** * 使用fire进行structured streaming开发的demo * * @author ChengLong 2019年12月23日 22:16:59 * @contact Fire框架技术交流群(钉钉):35373471 */ @Hive("batch") @Kafka(brokers = "zmsNew", topics = "sjzn_spark_scan_send_topic", groupId = "fire") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object StructuredStreamingTest extends BaseStructuredStreaming { /** * structured streaming处理逻辑 */ override def process: Unit = { // 接入kafka消息,并将消息解析为DataFrame,同时注册临时表,表名默认为kafka,也可传参手动指定表名 val kafkaDataset = this.fire.loadKafkaParseJson() // 进行sql查询,支持嵌套的json,并且支持大小写的json sql("select table, after.bill_code, after.scan_site from kafka").print() // 使用api的方式进行查询操作 kafkaDataset.select("after.PDA_CODE", "after.bill_code").print(numRows = 1, truncate = false) } } ================================================ FILE: fire-examples/spark-examples/src/main/scala/com/zto/fire/examples/spark/thread/ThreadTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.thread import com.zto.fire._ import com.zto.fire.common.util.{DateFormatUtils, ThreadUtils} import com.zto.fire.core.anno.connector.Kafka import com.zto.fire.spark.SparkStreaming import com.zto.fire.spark.anno.Streaming /** * 在driver中启用线程池的示例 * 1. 开启子线程执行一个任务 * 2. 开启子线程执行周期性任务 * * @contact Fire框架技术交流群(钉钉):35373471 */ @Streaming(interval = 10, checkpoint = false, concurrent = 2) @Kafka(brokers = "bigdata_test", topics = "fire", groupId = "fire") // 以上注解支持别名或url两种方式如:@Hive(thrift://hive:9083),别名映射需配置到cluster.properties中 object ThreadTest extends SparkStreaming { override def main(args: Array[String]): Unit = { // 第二个参数为true表示开启checkPoint机制 this.init(10L, false) } /** * Streaming的处理过程强烈建议放到process中,保持风格统一 * 注:此方法会被自动调用,在以下两种情况下,必须将逻辑写在process中 * 1. 开启checkpoint * 2. 支持streaming热重启(可在不关闭streaming任务的前提下修改batch时间) */ override def process: Unit = { // 第一次执行时延迟两分钟,每隔1分钟执行一次showSchema函数 ThreadUtils.schedule(this.showSchema, 1, 1) // 以子线程方式执行print方法中的逻辑 ThreadUtils.run(this.print) val dstream = this.fire.createKafkaDirectStream() dstream.foreachRDD(rdd => { println("count--> " + rdd.count()) }) } /** * 以子线程方式执行一次 */ def print: Unit = { println("==========子线程执行===========") } /** * 查看表结构信息 */ def showSchema: Unit = { println(s"${DateFormatUtils.formatCurrentDateTime()}--------------> atFixRate <----------------") sql("use tmp") sql("show tables").show(false) } } ================================================ FILE: fire-examples/spark-examples/src/test/resources/ConfigCenterUnitTest.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# #fire.thread.pool.size=10 fire.thread.pool.size=6 fire.restful.max.thread=9 fire.thread.pool.schedule.size=5 fire.jdbc.query.partitions=11 fire.hbase.scan.repartitions=110 fire.acc.log.max.size=22 ================================================ FILE: fire-examples/spark-examples/src/test/resources/SparkSQLParserTest.properties ================================================ spark.fire.task.schedule.enable=false spark.fire.acc.enable=false ================================================ FILE: fire-examples/spark-examples/src/test/resources/common.properties ================================================ spark.fire.task.schedule.enable=false spark.fire.acc.enable=false ================================================ FILE: fire-examples/spark-examples/src/test/scala/com/zto/fire/examples/spark/anno/AnnoConfTest.scala ================================================ package com.zto.fire.examples.spark.anno /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import com.zto.fire.common.anno.Config import com.zto.fire.common.conf.{FireFrameworkConf, FireHiveConf, FireKafkaConf, FireRocketMQConf} import com.zto.fire.common.util.PropUtils import com.zto.fire.core.anno.connector._ import com.zto.fire.hbase.conf.FireHBaseConf import com.zto.fire.jdbc.conf.FireJdbcConf import com.zto.fire.spark.SparkStreaming import com.zto.fire.spark.anno.{Streaming, StreamingDuration} import com.zto.fire.spark.conf.FireSparkConf import org.junit.Test @Config( """ |hive.cluster=test |spark.max.parallelism=11 |""") @Hive(value = "batch", catalog = "hive_catalog", version = "1.1.1", partition = "dt") @HBase(value = "batch-new1", batchSize = 10, durability = "off", scanPartitions = 12, config = Array("hbase.zookeeper.property.clientPort=2181", "zookeeper.znode.parent = /hbase")) @HBase2(value = "batch-new2", tableMetaCache = false, batchSize = 10, storageLevel = "memory_only", config = Array("hbase.zookeeper.property.clientPort=2182", "zookeeper.znode.parent = /hbase2")) @HBase3(value = "batch-new3", scanPartitions = 11, family = "data", maxRetries = 5, config = Array("hbase.zookeeper.property.clientPort=2183", "zookeeper.znode.parent = /hbase3")) @Kafka(brokers = "localhost:2181", topics = "fire", groupId = "fire", startingOffset = "start", endingOffsets = "end", autoCommit = true, sessionTimeout = 10, requestTimeout = 11, pollInterval = 12, forceOverwriteStateOffset = true, forceAutoCommit = true, forceAutoCommitInterval = 10) @Kafka2(brokers = "127.0.0.1:2181", topics = "fire2", groupId = "fire2", startingOffset = "start2", endingOffsets = "end2", sessionTimeout = 100, requestTimeout = 110, pollInterval = 120) @Kafka3(brokers = "127.0.0.1:2181", topics = "fire3", groupId = "fire3", startFromTimestamp = 100, startFromGroupOffsets = true, config = Array[String]("hello=world", "scala=flink")) 
@RocketMQ(brokers = "rocketmq", topics = "fire", groupId = "fire", startingOffset = "new", tag = "a", autoCommit = true, config = Array[String]("hello=world", "scala=flink")) @RocketMQ2(brokers = "rocketmq2", topics = "fire2", groupId = "fire2", startingOffset = "new2", tag = "b", autoCommit = true, config = Array[String]("hello=world2", "scala=flink2")) @Jdbc(url = "jdbc:mysql://localhost:3306", username = "root1", password = "root1", maxPoolSize = 10, maxIdleTime = 10, batchSize = 51, flushInterval = 1000, logSqlLength = 20, storageLevel = "memory", queryPartitions = 12) @Jdbc2(url = "jdbc:mysql://192.168.0.1:3306", driver = "com.fire", username = "root2", minPoolSize = 9, initialPoolSize = 8, password = "root2", maxRetries = 6, config = Array[String]("hello=world", "scala=flink")) @Jdbc3(url = "jdbc:mysql://192.168.0.2:3306", username = "root3", isolationLevel = "read", password = "root3", acquireIncrement = 2) @StreamingDuration(value = 20, checkpoint = false) @Streaming(value = 10, interval = 11, checkpoint = true, concurrent = 3, maxRatePerPartition = 10, backpressure = false, backpressureInitialRate = 26, stopGracefullyOnShutdown = false) class AnnoConfTest extends SparkStreaming { /** * 测试@Streaming注解 */ @Test def testStreaming: Unit = { assert(FireSparkConf.confBathDuration == 11) assert(this.conf.getBoolean("spark.streaming.receiver.writeAheadLog.enable", false)) assert(this.conf.getInt("spark.streaming.concurrentJobs", 1) == 3) assert(this.conf.getLong("spark.streaming.kafka.maxRatePerPartition", 0) == 10) assert(!this.conf.getBoolean("spark.streaming.backpressure.enabled", false)) assert(!this.conf.getBoolean("spark.streaming.stopGracefullyOnShutdown", true)) } /** * 测试@Jdbc注解 */ @Test def testJdbc: Unit = { assert(FireJdbcConf.url().equals("jdbc:mysql://localhost:3306")) assert(FireJdbcConf.url(2).equals("jdbc:mysql://192.168.0.1:3306")) assert(FireJdbcConf.url(3).equals("jdbc:mysql://192.168.0.2:3306")) assert(FireJdbcConf.driverClass().equals("com.mysql.jdbc.Driver")) assert(FireJdbcConf.driverClass(2).equals("com.fire")) // TODO: 自动推断driver // assert(FireJdbcConf.driverClass(3).equals("com.mysql.jdbc.Driver")) assert(FireJdbcConf.user().equals("root1")) assert(FireJdbcConf.user(2).equals("root2")) assert(FireJdbcConf.user(3).equals("root3")) assert(FireJdbcConf.password().equals("root1")) assert(FireJdbcConf.password(2).equals("root2")) assert(FireJdbcConf.password(3).equals("root3")) assert(FireJdbcConf.maxPoolSize() == 10) assert(FireJdbcConf.initialPoolSize(2) == 8) assert(FireJdbcConf.isolationLevel(3).equals("read")) assert(FireJdbcConf.maxIdleTime() == 10) assert(FireJdbcConf.maxRetry(2) == 6) assert(FireJdbcConf.acquireIncrement(3) == 2) assert(FireJdbcConf.batchSize() == 51) assert(FireFrameworkConf.logSqlLength == 20) assert(FireJdbcConf.jdbcStorageLevel.equals("MEMORY")) assert(FireJdbcConf.jdbcFlushInterval() == 1000) assert(FireJdbcConf.jdbcQueryPartition == 12) // "hello=world", "scala=flink" PropUtils.sliceKeysByNum(FireJdbcConf.JDBC_C3P0_CONF_PREFIX, 2).foreach(kv => { if (kv._1.equals("hello")) assert(kv._2.equals("world")) if (kv._1.equals("scala")) assert(kv._2.equals("flink")) }) } /** * 测试@RocketMQ注解 */ @Test def testRocketMQ: Unit = { assert(FireRocketMQConf.rocketNameServer().equals("rocketmq")) assert(FireRocketMQConf.rocketTopics().equals("fire")) assert(FireRocketMQConf.rocketGroupId().equals("fire")) assert(FireRocketMQConf.rocketStartingOffset().equals("new")) assert(FireRocketMQConf.rocketConsumerTag().equals("a")) 
assert(FireRocketMQConf.rocketEnableAutoCommit()) // "hello=world", "scala=flink" PropUtils.sliceKeysByNum(FireRocketMQConf.rocketConfStart, 1).foreach(kv => { if (kv._1.equals("hello")) assert(kv._2.equals("world")) if (kv._1.equals("scala")) assert(kv._2.equals("flink")) }) assert(FireRocketMQConf.rocketNameServer(2).equals("rocketmq2")) assert(FireRocketMQConf.rocketTopics(2).equals("fire2")) assert(FireRocketMQConf.rocketGroupId(2).equals("fire2")) assert(FireRocketMQConf.rocketStartingOffset(2).equals("new2")) assert(FireRocketMQConf.rocketConsumerTag(2).equals("b")) assert(FireRocketMQConf.rocketEnableAutoCommit(2)) // "hello=world", "scala=flink" PropUtils.sliceKeysByNum(FireRocketMQConf.rocketConfStart, 2).foreach(kv => { if (kv._1.equals("hello")) assert(kv._2.equals("world2")) if (kv._1.equals("scala")) assert(kv._2.equals("flink2")) }) } /** * 测试@Kafka注解 */ @Test def testKafka: Unit = { assert(FireKafkaConf.kafkaBrokers().equals("localhost:2181")) assert(FireKafkaConf.kafkaTopics().equals("fire")) assert(FireKafkaConf.kafkaGroupId().equals("fire")) assert(FireKafkaConf.kafkaStartingOffset().equals("start")) assert(FireKafkaConf.kafkaEndingOffsets().equals("end")) assert(FireKafkaConf.kafkaEnableAutoCommit()) assert(FireKafkaConf.kafkaSessionTimeOut() == 10) assert(FireKafkaConf.kafkaRequestTimeOut() == 11) assert(FireKafkaConf.kafkaPollInterval() == 12) assert(FireKafkaConf.kafkaForceOverwriteStateOffset) assert(FireKafkaConf.kafkaForceCommit) assert(FireKafkaConf.kafkaForceCommitInterval == 10) assert(FireKafkaConf.kafkaBrokers(2).equals("127.0.0.1:2181")) assert(FireKafkaConf.kafkaTopics(2).equals("fire2")) assert(FireKafkaConf.kafkaGroupId(2).equals("fire2")) assert(FireKafkaConf.kafkaStartingOffset(2).equals("start2")) assert(FireKafkaConf.kafkaEndingOffsets(2).equals("end2")) assert(FireKafkaConf.kafkaSessionTimeOut(2) == 100) assert(FireKafkaConf.kafkaRequestTimeOut(2) == 110) assert(FireKafkaConf.kafkaPollInterval(2) == 120) assert(FireKafkaConf.kafkaStartFromTimeStamp(3) == 100) assert(FireKafkaConf.kafkaStartFromGroupOffsets(3)) // "hello=world", "scala=flink" PropUtils.sliceKeysByNum(FireKafkaConf.kafkaConfStart, 3).foreach(kv => { if (kv._1.equals("hello")) assert(kv._2.equals("world")) if (kv._1.equals("scala")) assert(kv._2.equals("flink")) }) } /** * 测试@Config注解 */ @Test def testConfig: Unit = { assert(this.conf.getInt("spark.max.parallelism", 10240) == 11) } /** * hive 注解断言 */ @Test def testHive: Unit = { // @Hive注解优先级低于@Config assert(FireHiveConf.hiveCluster.equals("batch")) assert(FireHiveConf.hiveVersion.equals("1.1.1")) assert(FireHiveConf.hiveCatalogName.equals("hive_catalog")) assert(FireHiveConf.partitionName.equals("dt")) this.logInfo("assert hive annotation success.") } /** * hbase 注解断言 */ @Test def tesHBase: Unit = { assert(FireHBaseConf.hbaseCluster().equals("batch-new1")) assert(FireHBaseConf.hbaseCluster(2).equals("batch-new2")) assert(FireHBaseConf.hbaseCluster(3).equals("batch-new3")) assert(FireHBaseConf.hbaseDurability(1).equals("off")) assert(!FireHBaseConf.tableExistsCache(2)) assert(FireHBaseConf.familyName(3).equals("data")) assert(FireHBaseConf.hbaseHadoopScanPartitions() == 12) assert(FireHBaseConf.hbaseHadoopScanPartitions(2) == 1200) assert(FireHBaseConf.hbaseBatchSize() == 10) assert(FireHBaseConf.hbaseBatchSize(2) == 10) assert(FireHBaseConf.hbaseMaxRetry(3) == 5) assert(FireHBaseConf.hbaseMaxRetry(2) == 3) assert(FireHBaseConf.hbaseStorageLevel(2).equals("MEMORY_ONLY")) assert(FireHBaseConf.hbaseBatchSize() == 10) 
assert(FireHBaseConf.hbaseHadoopScanPartitions(3) == 11) assert(this.conf.getString("spark.fire.hbase.conf.hbase.zookeeper.property.clientPort").equals("2181")) assert(this.conf.getString("fire.hbase.conf.zookeeper.znode.parent2").equals("/hbase2")) assert(this.conf.getString("spark.fire.hbase.conf.hbase.zookeeper.property.clientPort3").equals("2183")) } } ================================================ FILE: fire-examples/spark-examples/src/test/scala/com/zto/fire/examples/spark/conf/ConfigCenterUnitTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.conf import com.zto.fire.common.anno.{Config, TestStep} import com.zto.fire.examples.spark.core.SparkTester import com.zto.fire.spark.SparkCore import com.zto.fire.spark.anno.Streaming import com.zto.fire.spark.util.SparkUtils import org.junit.Test /** * 用于测试配置fire框架的优先级 * * @author ChengLong 2022-05-16 16:14:21 * @date 2022-05-16 16:14:26 * @since 2.2.2 */ @Config( """ |fire.acc.timer.max.size=30 |fire.acc.log.max.size=20 |fire.conf.test=java |fire.thread.pool.schedule.size=6 |fire.conf.test=spark |""") @Streaming(20) // spark streaming的批次时间 class ConfigCenterUnitTest extends SparkCore with SparkTester { /** * 配置信息打印 * * ================================ * fire.thread.pool.size=6 * fire.thread.pool.schedule.size=5 * fire.acc.timer.max.size=30 * fire.acc.log.max.size=22 * fire.jdbc.query.partitions=11 * fire.conf.test=spark * ================================ */ @Test @TestStep(step = 1, desc = "测试配置优先级") def assertConf: Unit = { this.logger.warn(s"================runtime is ${SparkUtils.getExecutorId}================") val poolSize = this.conf.getInt("fire.thread.pool.size", -1) val scheduleSize = this.conf.getInt("fire.thread.pool.schedule.size", -1) val accSize = this.conf.getInt("fire.acc.timer.max.size", -1) val logSize = this.conf.getInt("fire.acc.log.max.size", -1) val partitions = this.conf.getInt("fire.jdbc.query.partitions", -1) val test = this.conf.getString("fire.conf.test") this.logger.warn(s"fire.thread.pool.size=$poolSize") this.logger.warn(s"fire.thread.pool.schedule.size=$scheduleSize") this.logger.warn(s"fire.acc.timer.max.size=$accSize") this.logger.warn(s"fire.acc.log.max.size=$logSize") this.logger.warn(s"fire.jdbc.query.partitions=$partitions") this.logger.warn(s"fire.conf.test=$test") assert(poolSize == 6) assert(scheduleSize == 5) assert(accSize == 30) assert(logSize == 22) assert(partitions == 11) assert(test.equals("spark")) this.logger.warn(s"=======================================") } } ================================================ FILE: fire-examples/spark-examples/src/test/scala/com/zto/fire/examples/spark/core/BaseSparkTester.scala ================================================ /* * Licensed to the 
Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.core import com.zto.fire._ import com.zto.fire.spark.BaseSpark import org.junit.{After, Before} /** * Spark 单元测试父接口,用于初始化fire与spark上下文 * * @author ChengLong * @date 2022-05-11 10:47:57 * @since 2.2.2 */ trait SparkTester extends BaseSpark { /** * 初始化fire框架与spark相关的运行时上下文 */ @Before def before: Unit = { this.init() } /** * 注销fire框架与spark的上下文信息 */ @After override def after: Unit = { if (noEmpty(this.sc, this.fire) && !this.sc.isStopped) this.stop } } ================================================ FILE: fire-examples/spark-examples/src/test/scala/com/zto/fire/examples/spark/hbase/HBaseApiTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.hbase import com.zto.fire.common.anno.TestStep import com.zto.fire.common.util.LineageManager import com.zto.fire.core.anno.connector.{HBase, HBase2} import com.zto.fire.examples.bean.Student import com.zto.fire.hbase.HBaseConnector import com.zto.fire.predef._ import com.zto.fire.spark.SparkCore import org.junit.Assert._ import org.junit.Test /** * 用于单元测试HBaseConnector中的API * * @author ChengLong * @since 2.2.2 * @date 2022-05-11 13:51:22 */ @HBase("test") @HBase2(cluster = "test", scanPartitions = 3, storageLevel = "DISK_ONLY") class HBaseApiTest extends SparkCore with HBaseTester { /** * 用于测试以下api: * 1. 判断表是否存在 * 2. disable 表 * 3.
create 表 */ @Test @TestStep(step = 0, desc = "DDL测试") def createTestTable: Unit = { if (HBaseConnector.isExists(this.tableName1)) HBaseConnector.dropTable(this.tableName1) assertEquals(HBaseConnector.isExists(this.tableName1), false) HBaseConnector.createTable(this.tableName1, Seq("info")) assertEquals(HBaseConnector.isExists(this.tableName1), true) if (HBaseConnector(2).isExists(this.tableName2)) HBaseConnector(2).dropTable(this.tableName2) assertEquals(HBaseConnector(2).isExists(this.tableName2), false) HBaseConnector(2).createTable(this.tableName2, "info", "data") assertEquals(HBaseConnector(2).isExists(this.tableName2), true) } /** * 测试表是否存在的缓存功能 */ @Test @TestStep(step = 2, desc = "增删改查API测试") def testTableExists: Unit = { val starTime = currentTime (1 to 10).foreach(i => { HBaseConnector.tableExists(this.tableName1) }) println("未开启缓存总耗时:" + (elapsed(starTime))) val starTime2 = currentTime (1 to 10).foreach(i => { HBaseConnector.isExists(this.tableName1) }) println("开启缓存总耗时:" + (elapsed(starTime2))) } /** * 测试插入多条记录 */ @Test @TestStep(step = 3, desc = "增删改查API测试") def testInsert: Unit = { // 批量插入 val studentList = Student.newStudentList().toSeq HBaseConnector.insert(this.tableName1, studentList) // get操作 val rowKeyList = (1 to 5).map(i => i.toString) val getStudentList = HBaseConnector.get(this.tableName1, classOf[Student], rowKeyList) assertEquals(getStudentList.size, 5) getStudentList.foreach(println) val scanList = HBaseConnector.scan(this.tableName1, classOf[Student], "2", "4") assertEquals(scanList.size, 2) scanList.foreach(println) } /** * 测试跨集群支持 */ @Test @TestStep(step = 4, desc = "多集群测试") def testMultiCluster: Unit = { HBaseConnector.truncateTable(this.tableName1) HBaseConnector(2).truncateTable(this.tableName2) val studentList1 = Student.newStudentList().toSeq HBaseConnector.insert(this.tableName1, studentList1) val scanStudentList1 = HBaseConnector.scan(this.tableName1, classOf[Student], "1", "6") assertEquals(scanStudentList1.size, 5) val studentList2 =Student.newStudentList() HBaseConnector(2).insert(this.tableName2, studentList2: _*) val scanStudentList2 = HBaseConnector(2).scan(this.tableName2, classOf[Student], "1", "6") assertEquals(scanStudentList2.size, 5) assertEquals(LineageManager.getDatasourceLineage.size(), 1) } /** * 测试多版本插入 * 注:多版本需要在Student类上声明@HConfig注解:@HConfig(nullable = true, multiVersion = true) */ @Test @TestStep(step = 5, desc = "多版本测试") def testMultiInsert: Unit = { val studentList = Student.newStudentList() HBaseConnector(2).insert(this.tableName2, studentList: _*) val students = HBaseConnector(2).get(this.tableName2, classOf[Student], "1", "2") assertEquals(students.size, 2) } } ================================================ FILE: fire-examples/spark-examples/src/test/scala/com/zto/fire/examples/spark/hbase/HBaseBaseTester.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
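// ----------------------------------------------------------------------------
// A minimal sketch (not from the repository) of the multi-cluster pattern used
// by HBaseApiTest above: HBaseConnector addresses the cluster declared by @HBase,
// while HBaseConnector(2) addresses the cluster declared by @HBase2. Table names
// are hypothetical; the API calls mirror the test.
// ----------------------------------------------------------------------------
// cluster 1 (@HBase): create the table if needed, then write and read back by rowKey
if (!HBaseConnector.isExists("demo_table_1")) HBaseConnector.createTable("demo_table_1", Seq("info"))
HBaseConnector.insert("demo_table_1", Student.newStudentList().toSeq)
val fromCluster1 = HBaseConnector.get("demo_table_1", classOf[Student], Seq("1", "2"))

// cluster 2 (@HBase2): the same API, selected through the keyNum-style factory
if (!HBaseConnector(2).isExists("demo_table_2")) HBaseConnector(2).createTable("demo_table_2", "info", "data")
HBaseConnector(2).insert("demo_table_2", Student.newStudentList(): _*)
val fromCluster2 = HBaseConnector(2).scan("demo_table_2", classOf[Student], "1", "6")
// ----------------------------------------------------------------------------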
* See the License for the specific language governing permissions and * limitations under the License. */
package com.zto.fire.examples.spark.hbase

import com.zto.fire._
import com.zto.fire.examples.bean.Student
import com.zto.fire.examples.spark.core.SparkTester
import com.zto.fire.hbase.HBaseConnector
import org.junit.Before

/**
 * Utility trait for unit testing the HBase API.
 *
 * @author ChengLong
 * @date 2022-05-11 13:52:25
 * @since 2.2.2
 */
trait HBaseTester extends SparkTester {
  val tableName1 = "fire_test_1"
  val tableName2 = "fire_test_2"

  @Before
  override def before: Unit = {
    super.before
    if (!HBaseConnector.isExists(this.tableName1)) HBaseConnector.createTable(this.tableName1, Seq("info"))
    if (!HBaseConnector.isExists(this.tableName2)) HBaseConnector.createTable(this.tableName2, Seq("info"))
    this.truncate
  }

  /**
   * Inserts test data into HBase.
   */
  protected[this] def putData: Unit = {
    this.truncate
    val studentList = Student.newStudentList()
    this.fire.hbasePutList(this.tableName1, studentList)
  }

  protected[this] def truncate: Unit = {
    HBaseConnector.truncateTable(this.tableName1)
    HBaseConnector.truncateTable(this.tableName2, keyNum = 2)
  }
}

================================================ FILE: fire-examples/spark-examples/src/test/scala/com/zto/fire/examples/spark/hbase/HBaseBulkUnitTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/ package com.zto.fire.examples.spark.hbase import com.zto.fire._ import com.zto.fire.common.anno.TestStep import com.zto.fire.core.anno.connector.{HBase, HBase2} import com.zto.fire.examples.bean.Student import com.zto.fire.hbase.HBaseConnector import com.zto.fire.spark.SparkCore import org.apache.spark.sql.{Encoders, Row} import org.junit.Test /** * 测试基于bulk的方式读写HBase * * @author ChengLong * @date 2022-05-11 15:01:10 * @since 2.2.2 */ @HBase("test") @HBase2(cluster = "test", scanPartitions = 3) class HBaseBulkUnitTest extends SparkCore with HBaseTester { /** * 使用id作为rowKey */ val buildStudentRowKey = (row: Row) => { row.getAs("id").toString } /** * 使用bulk的方式将rdd写入到hbase */ @Test @TestStep(step = 1, desc = "testHbaseBulkPutRDD") def testHbaseBulkPutRDD: Unit = { val rdd = this.fire.createRDD(Student.newStudentList(), 2) this.fire.hbaseBulkPutRDD(this.tableName1, rdd) this.assertResult } /** * 使用bulk的方式将DataFrame写入到hbase */ @Test @TestStep(step = 2, desc = "testHbaseBulkPutDF") def testHbaseBulkPutDF: Unit = { val rdd = this.fire.createRDD(Student.newStudentList(), 2) val studentDF = this.fire.createDataFrame(rdd, classOf[Student]) this.fire.hbaseBulkPutDF(this.tableName1, studentDF, classOf[Student]) this.assertResult } /** * 使用bulk的方式将Dataset写入到hbase */ @Test @TestStep(step = 3, desc = "testHbaseBulkPutDS") def testHbaseBulkPutDS: Unit = { val rdd = this.fire.createRDD(Student.newStudentList(), 2) val studentDataset = this.fire.createDataset(rdd)(Encoders.bean(classOf[Student])) this.fire.hbaseBulkPutDS(this.tableName1, studentDataset) this.assertResult } /** * 使用bulk方式批量删除指定的rowKey对应的数据 */ @Test @TestStep(step = 4, desc = "testHBaseBulkDeleteRDD") def testHBaseBulkDeleteRDD: Unit = { this.testHbaseBulkPutRDD val rowKeySeq = Seq(1.toString, 2.toString, 5.toString, 6.toString) this.fire.hbaseDeleteList(this.tableName1, rowKeySeq) val getList = rowKeySeq.map(rowKey => HBaseConnector.buildGet(rowKey)) val result = this.fire.hbaseGetList(this.tableName1, classOf[Student], getList) assert(result.isEmpty) } /** * 使用bulk方式批量删除指定的rowKey对应的数据 */ @Test @TestStep(step = 5, desc = "testHBaseBulkDeleteDS") def testHBaseBulkDeleteDS: Unit = { this.testHbaseBulkPutRDD val rowKeySeq = Seq(1.toString, 2.toString, 5.toString, 6.toString) val rowKeyRdd = this.fire.createRDD(rowKeySeq, 2) this.fire.createDataset(rowKeyRdd)(Encoders.STRING).hbaseBulkDeleteDS(this.tableName1) val getList = rowKeySeq.map(rowKey => HBaseConnector.buildGet(rowKey)) val result = this.fire.hbaseGetList(this.tableName1, classOf[Student], getList) assert(result.isEmpty) } /** * 通过查询结果断言是否正确 */ private def assertResult: Unit = { this.testHBaseBulkGetSeq this.testHBaseBulkGetRDD this.testHBaseBulkGetDF this.testHBaseBulkGetDS this.testHbaseBulkScanRDD this.testHbaseBulkScanDF this.testHbaseBulkScanDS } /** * 使用bulk方式根据rowKey集合获取数据,并将结果集以RDD形式返回 */ private def testHBaseBulkGetSeq: Unit = { val seq = Seq(1.toString, 2.toString, 3.toString, 5.toString, 6.toString) val studentRDD = this.fire.hbaseBulkGetSeq(this.tableName1, seq, classOf[Student]) assert(studentRDD.count() == 5) } /** * 使用bulk方式根据rowKey获取数据,并将结果集以RDD形式返回 */ private def testHBaseBulkGetRDD: Unit = { val rowKeyRdd = this.fire.createRDD(Seq(1.toString, 2.toString, 3.toString, 5.toString, 6.toString), 2) val studentRDD = rowKeyRdd.hbaseBulkGetRDD(this.tableName1, classOf[Student], keyNum = 2) assert(studentRDD.count() == 5) } /** * 使用bulk方式根据rowKey获取数据,并将结果集以DataFrame形式返回 */ private def testHBaseBulkGetDF: Unit = { val rowKeyRdd = this.fire.createRDD(Seq(1.toString, 
2.toString, 3.toString, 5.toString, 6.toString, 111.toString), 2) val studentDF = this.fire.hbaseBulkGetDF(this.tableName1, rowKeyRdd, classOf[Student]) assert(studentDF.count() == 5) val rowKeyRdd2 = this.fire.createRDD(Seq[String](), 2) val studentDF2 = this.fire.hbaseBulkGetDF(this.tableName1, rowKeyRdd2, classOf[Student]) assert(studentDF2.count() == 0) } /** * 使用bulk方式根据rowKey获取数据,并将结果集以Dataset形式返回 */ private def testHBaseBulkGetDS: Unit = { val rowKeyRdd = this.fire.createRDD(Seq(1.toString, 2.toString, 3.toString, 5.toString, 6.toString), 2) val studentDS2 = this.fire.hbaseBulkGetDS(this.tableName1, rowKeyRdd, classOf[Student]) assert(studentDS2.count() == 5) } /** * 使用bulk方式进行scan,并将结果集映射为RDD */ private def testHbaseBulkScanRDD: Unit = { val scanRDD = this.fire.hbaseBulkScanRDD2(this.tableName1, classOf[Student], "1", "6") assert(scanRDD.count() == 5) } /** * 使用bulk方式进行scan,并将结果集映射为DataFrame */ private def testHbaseBulkScanDF: Unit = { val scanDF = this.fire.hbaseBulkScanDF2(this.tableName1, classOf[Student], "1", "6") assert(scanDF.count() == 5) } /** * 使用bulk方式进行scan,并将结果集映射为Dataset */ private def testHbaseBulkScanDS: Unit = { val scanDS = this.fire.hbaseBulkScanDS(this.tableName1, classOf[Student], HBaseConnector.buildScan("1", "6")) assert(scanDS.count() == 5) } } ================================================ FILE: fire-examples/spark-examples/src/test/scala/com/zto/fire/examples/spark/hbase/HBaseConnectorUnitTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
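// ----------------------------------------------------------------------------
// A minimal sketch (not from the repository) of the bulk read/write pattern in
// HBaseBulkUnitTest above, inside a test extending SparkCore with HBaseTester:
// RDDs of HBaseBaseBean subclasses are written with hbaseBulkPutRDD and read
// back by rowKey or by scan range. The table name is hypothetical; every call
// mirrors the test.
// ----------------------------------------------------------------------------
val studentRDD = this.fire.createRDD(Student.newStudentList(), 2)
this.fire.hbaseBulkPutRDD("demo_bulk_table", studentRDD)

// point lookups by rowKey, returned as an RDD
val rowKeyRDD = this.fire.createRDD(Seq("1", "2", "3"), 2)
val byKey = rowKeyRDD.hbaseBulkGetRDD("demo_bulk_table", classOf[Student])

// range scan from startRow (inclusive) to stopRow (exclusive), returned as an RDD
val byScan = this.fire.hbaseBulkScanRDD2("demo_bulk_table", classOf[Student], "1", "6")
// ----------------------------------------------------------------------------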
*/ package com.zto.fire.examples.spark.hbase import com.zto.fire._ import com.zto.fire.common.anno.TestStep import com.zto.fire.core.anno.connector.{HBase, HBase2} import com.zto.fire.examples.bean.{Student, StudentMulti} import com.zto.fire.hbase.HBaseConnector import com.zto.fire.spark.SparkCore import org.apache.hadoop.hbase.client.Get import org.apache.spark.sql.Encoders import org.junit.Test import java.nio.charset.StandardCharsets import scala.collection.mutable.ListBuffer /** * 在spark中使用java 同步 api (HBaseConnector) 的方式读写hbase表 * 注:适用于少量数据的实时读写,更轻量 * * @author ChengLong 2019-5-9 09:37:25 */ @HBase("test") @HBase2(cluster = "test", scanPartitions = 3, storageLevel = "DISK_ONLY") class HBaseConnectorUnitTest extends SparkCore with HBaseTester { /** * 使用HBaseConnector插入一个集合,可以是list、set等集合 * 但集合的类型必须为HBaseBaseBean的子类 */ @Test @TestStep(step = 1, desc = "testHbasePutList") def testHbasePutList: Unit = { this.putData val getList = ListBuffer[Get]() val rowKeys = Seq("1", "2", "3", "5", "6") rowKeys.map(rowkey => (getList += new Get(rowkey.getBytes(StandardCharsets.UTF_8)))) // 获取多版本形式存放的记录,并获取最新的两个版本就 val resultList = this.fire.hbaseGetList(this.tableName1, classOf[Student], getList) assert(resultList.size == 5) val resultList2 = this.fire.hbaseGetList2(this.tableName1, classOf[Student], rowKeys) assert(resultList2.size == 5) val scanResultList = this.fire.hbaseScanList2(this.tableName1, classOf[Student], "2", "6") assert(scanResultList.size == 4) } /** * 使用HBaseConnector插入一个rdd的数据 * rdd的类型必须为HBaseBaseBean的子类 */ @Test @TestStep(step = 2, desc = "testHbasePutRDD") def testHbasePutRDD: Unit = { val studentList = Student.newStudentList() val studentRDD = this.fire.createRDD(studentList, 2) // 为空的字段不插入 studentRDD.hbasePutRDD(this.tableName1) val getList = Seq("1", "2", "3", "5", "6") val getRDD = this.fire.createRDD(getList, 2) val resultRDD = this.fire.hbaseGetRDD(this.tableName1, classOf[Student], getRDD) assert(resultRDD.count() == 5) val scanResultRdd = this.fire.hbaseScanRDD2(this.tableName1, classOf[Student], "2", "6") assert(scanResultRdd.count() == 4) } /** * 使用HBaseConnector插入一个DataFrame的数据 */ @Test @TestStep(step = 3, desc = "testHBasePutDF") def testHBasePutDF: Unit = { val studentList = Student.newStudentList() val studentDF = this.fire.createDataFrame(studentList, classOf[Student]) studentDF.hbasePutDF(this.tableName1, classOf[Student]) val getList = Seq("1", "2", "3", "4", "5", "6") val getRDD = this.fire.createRDD(getList, 3) // get到的结果以dataframe形式返回 val resultDF = this.fire.hbaseGetDF(this.tableName1, classOf[Student], getRDD) assert(resultDF.count() == 6) val dataFrame = this.fire.hbaseScanDF2(this.tableName1, classOf[Student], "2", "6") assert(dataFrame.count() == 4) } /** * 使用HBaseConnector插入一个Dataset的数据 * dataset的类型必须为HBaseBaseBean的子类 */ @Test @TestStep(step = 4, desc = "testHBasePutDS") def testHBasePutDS: Unit = { val studentList = Student.newStudentList() val studentDS = this.fire.createDataset(studentList)(Encoders.bean(classOf[Student])) // 以多版本形式插入 studentDS.hbasePutDS(this.tableName1, classOf[Student]) val getList = Seq("1", "2", "3", "4", "5", "6") val getRDD = this.fire.createRDD(getList, 2) // 指定在多版本获取时只取最新的两个版本 val resultDS = this.fire.hbaseGetDS(this.tableName1, classOf[Student], getRDD) println(resultDS.count()) assert(resultDS.count() == 6) val dataSet = this.fire.hbaseScanDS2(this.tableName1, classOf[Student], "2", "6") assert(dataSet.count() == 4) } /** * 根据指定的rowKey list,批量删除指定的记录 */ @Test @TestStep(step = 5, desc = "testHbaseDeleteList") def 
testHbaseDeleteList: Unit = { this.putData val rowKeyList = Seq(1.toString, 2.toString, 5.toString, 8.toString) this.fire.hbaseDeleteList(this.tableName1, rowKeyList) val getList = rowKeyList.map(rowKey => HBaseConnector.buildGet(rowKey)) val result = this.fire.hbaseGetList(this.tableName1, classOf[Student], getList) assert(result.isEmpty) } /** * 根据指定的rowKey rdd,批量删除指定的记录 */ @Test @TestStep(step = 6, desc = "testHbasePutList") def testHBaseDeleteRDD: Unit = { this.putData val rowKeyList = Seq(1.toString, 2.toString, 3.toString, 4.toString, 5.toString, 6.toString, 7.toString, 8.toString, 9.toString, 10.toString) val rowKeyRDD = this.fire.createRDD(rowKeyList, 2) rowKeyRDD.hbaseDeleteRDD(this.tableName1) val getList = rowKeyList.map(rowKey => HBaseConnector.buildGet(rowKey)) val result = this.fire.hbaseGetList(this.tableName1, classOf[Student], getList) assert(result.isEmpty) } /** * 根据指定的rowKey dataset,批量删除指定的记录 */ @Test @TestStep(step = 6, desc = "testHbaseDeleteDS") def testHbaseDeleteDS: Unit = { this.putData val rowKeyList = Seq(1.toString, 2.toString, 5.toString, 8.toString) val rowKeyDS = this.fire.createDataset(rowKeyList)(Encoders.STRING) rowKeyDS.hbaseDeleteDS(this.tableName1) val getList = rowKeyList.map(rowKey => HBaseConnector.buildGet(rowKey)) val result = this.fire.hbaseGetList(this.tableName1, classOf[Student], getList) assert(result.isEmpty) } /** * 测试多版本 */ @Test @TestStep(step = 7, desc = "testMultiVersion") def testMultiVersion: Unit = { val studentList = StudentMulti.newStudentMultiList() val studentDF = this.fire.createDataFrame(studentList, classOf[StudentMulti]) studentDF.hbasePutDF(this.tableName2, classOf[StudentMulti]) val getList = Seq("1", "2", "3", "4", "5", "6") val getRDD = this.fire.createRDD(getList, 3) // get到的结果以dataframe形式返回 val resultDF = this.fire.hbaseGetDF(this.tableName2, classOf[StudentMulti], getRDD) assert(resultDF.count() == 6) val dataFrame = this.fire.hbaseScanDF2(this.tableName2, classOf[StudentMulti], "2", "6") assert(dataFrame.count() == 4) dataFrame.show() } } ================================================ FILE: fire-examples/spark-examples/src/test/scala/com/zto/fire/examples/spark/hbase/HBaseHadoopUnitTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
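// ----------------------------------------------------------------------------
// A hedged sketch (not from the repository) of the multi-version note in
// testMultiVersion above: enabling multi-version storage is declared on the bean
// via @HConfig, not on the API call. VersionedStudent is a hypothetical bean
// modeled on the repository's Student/StudentMulti examples; it is assumed here
// that HBase beans extend HBaseBaseBean parameterized with their own type.
// ----------------------------------------------------------------------------
import com.zto.fire.hbase.anno.HConfig
import com.zto.fire.hbase.bean.HBaseBaseBean

@HConfig(nullable = true, multiVersion = true)
class VersionedStudent extends HBaseBaseBean[VersionedStudent] {
  // fields follow the usual JavaBean/HBase mapping of the examples (id, name, ...)
}

// reads and writes then go through the same connector API as single-version beans,
// e.g. HBaseConnector(2).insert(table, beans: _*) and
//      HBaseConnector(2).get(table, classOf[VersionedStudent], "1", "2")
// ----------------------------------------------------------------------------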
*/ package com.zto.fire.examples.spark.hbase import com.zto.fire._ import com.zto.fire.common.anno.TestStep import com.zto.fire.core.anno.connector.{HBase, HBase2} import com.zto.fire.examples.bean.Student import com.zto.fire.spark.SparkCore import org.apache.spark.sql.{Encoders, Row} import org.junit.Test /** * 测试基于hadoop的方式读写HBase * * @author ChengLong * @date 2022-05-11 15:26:10 * @since 2.2.2 */ @HBase("test") @HBase2(cluster = "test", scanPartitions = 3) class HBaseHadoopUnitTest extends SparkCore with HBaseTester { /** * 基于saveAsNewAPIHadoopDataset封装,将rdd数据保存到hbase中 */ @Test @TestStep(step = 1, desc = "testHbaseHadoopPutRDD") def testHbaseHadoopPutRDD: Unit = { val studentRDD = this.fire.createRDD(Student.newStudentList(), 2) this.fire.hbaseHadoopPutRDD(this.tableName1, studentRDD, keyNum = 2) this.assertScan } /** * 基于saveAsNewAPIHadoopDataset封装,将DataFrame数据保存到hbase中 */ @Test @TestStep(step = 2, desc = "testHbaseHadoopPutDF") def testHbaseHadoopPutDF: Unit = { val studentRDD = this.fire.createRDD(Student.newStudentList(), 2) val studentDF = this.fire.createDataFrame(studentRDD, classOf[Student]) this.fire.hbaseHadoopPutDF(this.tableName1, studentDF, classOf[Student]) this.assertScan } /** * 基于saveAsNewAPIHadoopDataset封装,将Dataset数据保存到hbase中 */ @Test @TestStep(step = 3, desc = "testHbaseHadoopPutDS") def testHbaseHadoopPutDS: Unit = { val studentDS = this.fire.createDataset(Student.newStudentList())(Encoders.bean(classOf[Student])) this.fire.hbaseHadoopPutDS(this.tableName1, studentDS) this.assertScan } /** * 基于saveAsNewAPIHadoopDataset封装,将不是HBaseBaseBean结构对应的DataFrame保存到hbase中 * 注:此方法与hbaseHadoopPutDF不同之处在于,它不强制要求该DataFrame一定要与HBaseBaseBean的子类对应 * 但需要指定rowKey的构建规则,相对与hbaseHadoopPutDF来说,少了中间的两次转换,性能会更高 */ @Test @TestStep(step = 4, desc = "testHbaseHadoopPutDFRow") def testHbaseHadoopPutDFRow: Unit = { /** * 构建main_order rowkey */ val buildRowKey = (row: Row) => { // 将id字段作为rowKey row.getAs("id").toString } val studentRDD = this.fire.createRDD(Student.newStudentList(), 2) this.fire.createDataFrame(studentRDD, classOf[Student]).createOrReplaceTempView("student") // 指定rowKey构建的函数 sql("select age,createTime,id,length,name,sex from student").hbaseHadoopPutDFRow(this.tableName1, buildRowKey) this.assertScan } /** * 断言scan结果 */ private def assertScan: Unit = { this.testHBaseHadoopScanRDD this.testHBaseHadoopScanDF this.testHBaseHadoopScanDS } /** * 使用Spark的方式scan海量数据,并将结果集映射为RDD */ private def testHBaseHadoopScanRDD: Unit = { val studentRDD = this.fire.hbaseHadoopScanRDD2(this.tableName1, classOf[Student], "1", "6", keyNum = 2) assert(studentRDD.count() == 5) } /** * 使用Spark的方式scan海量数据,并将结果集映射为DataFrame */ private def testHBaseHadoopScanDF: Unit = { val studentDF = this.fire.hbaseHadoopScanDF2(this.tableName1, classOf[Student], "1", "6") assert(studentDF.count() == 5) studentDF.show() } /** * 使用Spark的方式scan海量数据,并将结果集映射为Dataset */ private def testHBaseHadoopScanDS: Unit = { val studentDS = this.fire.hbaseHadoopScanDS2(this.tableName1, classOf[Student], "1", "6") assert(studentDS.count() == 5) } } ================================================ FILE: fire-examples/spark-examples/src/test/scala/com/zto/fire/examples/spark/hive/HiveUnitTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
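// ----------------------------------------------------------------------------
// A minimal sketch (not from the repository) of testHbaseHadoopPutDFRow above:
// when the DataFrame does not map onto an HBaseBaseBean subclass, the caller
// supplies the rowKey builder explicitly. View and table names are hypothetical;
// the calls mirror the test.
// ----------------------------------------------------------------------------
// rowKey builder: take the id column of each Row as the rowKey
val buildRowKey = (row: Row) => row.getAs("id").toString

val demoRDD = this.fire.createRDD(Student.newStudentList(), 2)
this.fire.createDataFrame(demoRDD, classOf[Student]).createOrReplaceTempView("demo_student")

// write the projected columns, keyed by the supplied builder
sql("select age, createTime, id, length, name, sex from demo_student")
  .hbaseHadoopPutDFRow("demo_hadoop_table", buildRowKey)
// ----------------------------------------------------------------------------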
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.hive import com.zto.fire.common.anno.TestStep import com.zto.fire.core.anno.connector.Hive import com.zto.fire.examples.spark.core.SparkTester import com.zto.fire.spark.SparkCore import org.junit.Test /** * 用于测试与hive的集成 * * @author ChengLong * @date 2022-05-12 14:56:36 * @since 2.2.2 */ @Hive("test") class HiveUnitTest extends SparkCore with SparkTester { @Test @TestStep(step = 1, desc = "测试列出所有的数据库名称") def testShowDatabases: Unit = { val df = sql("show databases") assert(df.count() > 3) } @Test @TestStep(step = 1, desc = "测试列出tmp库下所有的hive表名称") def testShowTables: Unit = { sql("use tmp") val df = sql("show tables") assert(df.count() > 10) } } ================================================ FILE: fire-examples/spark-examples/src/test/scala/com/zto/fire/examples/spark/jdbc/JdbcConnectorTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.examples.spark.jdbc import com.zto.fire.common.anno.TestStep import com.zto.fire.core.anno.connector.{Jdbc, Jdbc3} import com.zto.fire.examples.bean.Student import com.zto.fire.examples.spark.core.SparkTester import com.zto.fire.jdbc.JdbcConnector import com.zto.fire.predef._ import com.zto.fire.spark.SparkCore import org.junit.Assert._ import org.junit.{After, Before, Test} /** * 用于测试JdbcConnector相关API * * @author ChengLong * @since 2.2.2 * @create 2022-05-12 13:26:11 */ @Jdbc(url = "jdbc:derby:memory:fire;create=true", username = "fire", password = "fire", driver = "org.apache.derby.jdbc.EmbeddedDriver") @Jdbc3(url = "jdbc:derby:memory:fire2;create=true", username = "fire", password = "fire", maxPoolSize = 1, driver = "org.apache.derby.jdbc.EmbeddedDriver") class JdbcConnectorTest extends SparkCore with SparkTester { private var jdbc: JdbcConnector = _ private var jdbc3: JdbcConnector = _ private val tableName = "t_student" private val createTable = s""" |CREATE TABLE $tableName( | id BIGINT, | name VARCHAR(100), | age INT, | createTime VARCHAR(20), | length double, | sex CHAR, | rowkey VARCHAR(100) |) |""".stripMargin @Before override def before: Unit = { super.before this.jdbc = JdbcConnector() this.jdbc.executeUpdate(this.createTable) this.jdbc3 = JdbcConnector(keyNum = 3) this.jdbc3.executeUpdate(this.createTable) } /** * 基于derby数据库进行crud测试 */ @Test @TestStep(step = 1, desc = "jdbc CRUD测试") def testCRUD: Unit = { val studentName = "root" val deleteSql = s"delete from $tableName where name=?" this.jdbc.executeUpdate(deleteSql, Seq(studentName)) this.jdbc3.executeUpdate(deleteSql, Seq(studentName)) val selectSql = s"select * from $tableName where name=?" val studentList1 = this.jdbc.executeQueryList(selectSql, Seq(studentName), classOf[Student]) val studentList3 = this.jdbc3.executeQueryList(selectSql, Seq(studentName), classOf[Student]) assertEquals(studentList1.size, 0) studentList1.foreach(println) assertEquals(studentList3.size, 0) studentList3.foreach(println) val insertSql = s"insert into $tableName(name, age, length) values(?, ?, ?)" this.jdbc.executeUpdate(insertSql, Seq(studentName, 10, 10.3)) this.jdbc3.executeUpdate(insertSql, Seq(studentName, 10, 10.3)) val studentList11 = this.jdbc.executeQueryList(selectSql, Seq(studentName), classOf[Student]) val studentList33 = this.jdbc3.executeQueryList(selectSql, Seq(studentName), classOf[Student]) assertEquals(studentList11.size, 1) studentList11.foreach(println) assertEquals(studentList33.size, 1) studentList33.foreach(println) } @After override def after: Unit = { this.jdbc.executeUpdate(s"drop table $tableName") this.jdbc3.executeUpdate(s"drop table $tableName") } } ================================================ FILE: fire-examples/spark-examples/src/test/scala/com/zto/fire/examples/spark/jdbc/JdbcUnitTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
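// ----------------------------------------------------------------------------
// A minimal sketch (not from the repository) of the JdbcConnector pattern in
// JdbcConnectorTest above: JdbcConnector() binds to the @Jdbc datasource and
// JdbcConnector(keyNum = 3) to the @Jdbc3 datasource; statements are
// parameterized with a Seq of bind values. Table and values are hypothetical.
// ----------------------------------------------------------------------------
val jdbc = JdbcConnector()
jdbc.executeUpdate("insert into t_student(name, age, length) values(?, ?, ?)", Seq("root", 10, 10.3))
val students = jdbc.executeQueryList("select * from t_student where name=?", Seq("root"), classOf[Student])

// a second datasource is reached through the same API, selected by keyNum
val jdbc3 = JdbcConnector(keyNum = 3)
jdbc3.executeUpdate("delete from t_student where name=?", Seq("root"))
// ----------------------------------------------------------------------------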
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.examples.spark.jdbc import com.zto.fire._ import com.zto.fire.common.anno.TestStep import com.zto.fire.common.util.DateFormatUtils import com.zto.fire.core.anno.connector.{Jdbc, Jdbc2} import com.zto.fire.examples.bean.Student import com.zto.fire.examples.spark.core.SparkTester import com.zto.fire.spark.SparkCore import org.junit.Test /** * Spark jdbc相关api单元测试 * * @author ChengLong * @date 2022-05-12 13:49:24 * @since 2.2.2 */ @Jdbc(url = "jdbc:mysql://mysql-server:3306/fire", username = "root", password = "root") @Jdbc2(url = "jdbc:mysql://mysql-server:3306/fire", username = "root", password = "root") class JdbcUnitTest extends SparkCore with SparkTester { lazy val tableName = "spark_test" /** * 使用jdbc方式对关系型数据库进行增删改操作 */ @Test @TestStep(step = 1, desc = "测试基本的增删改查api") def testCRUD: Unit = { this.truncate val timestamp = DateFormatUtils.formatCurrentDateTime() // 执行insert操作 val insertSql = s"INSERT INTO $tableName (name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" this.fire.jdbcUpdate(insertSql, Seq("admin", 12, timestamp, 10.0, 1)) var resultList = this.fire.jdbcQueryList(s"select id, name, age, createTime, length, sex from $tableName where id=1", null, classOf[Student]) assert(resultList.head.getName.equals("admin")) // 更新配置文件中指定的第二个关系型数据库 this.fire.jdbcUpdate(insertSql, Seq("admin", 12, timestamp, 10.0, 1), keyNum = 2) resultList = this.fire.jdbcQueryList(s"select id, name, age, createTime, length, sex from $tableName where id=1", null, classOf[Student], keyNum = 2) assert(resultList.head.getName.equals("admin")) // 执行更新操作 val updateSql = s"UPDATE $tableName SET name=? WHERE id=?" this.fire.jdbcUpdate(updateSql, Seq("root", 1)) resultList = this.fire.jdbcQueryList(s"select id, name, age, createTime, length, sex from $tableName where id=1", null, classOf[Student]) assert(resultList.head.getName.equals("root")) // 执行批量操作 this.initData resultList = this.fire.jdbcQueryList(s"select id, name, age, createTime, length, sex from $tableName", null, classOf[Student]) assert(resultList.size == 5) this.fire.jdbcBatchUpdate(s"update $tableName set sex=? where id=?", Seq(Seq(1, 1), Seq(2, 2), Seq(3, 3), Seq(4, 4), Seq(5, 5), Seq(6, 6))) val sql = s"DELETE FROM $tableName WHERE id=?" 
this.fire.jdbcUpdate(sql, Seq(2)) resultList = this.fire.jdbcQueryList(s"select id, name, age, createTime, length, sex from $tableName where id=2", null, classOf[Student]) assert(resultList.isEmpty) } /** * 使用jdbc方式对关系型数据库进行查询操作 */ @Test @TestStep(step = 2, desc = "测试查询相关的API") def testJdbcQuery: Unit = { this.initData val sql = s"select * from $tableName where id in (?, ?, ?)" // 将查询结果集以List[JavaBean]方式返回 val list = this.fire.jdbcQueryList(sql, Seq(1, 2, 3), classOf[Student]) // 方式二:使用JdbcConnector assert(list.size == 3) // 将结果集封装到RDD中 val rdd = this.fire.jdbcQueryRDD(sql, Seq(1, 2, 3)) assert(rdd.count() == 3) // 将结果集封装到DataFrame中 val df = this.fire.jdbcQueryDF(sql, Seq(1, 2, 3)) assert(df.count() == 3) } /** * 使用spark方式对表进行数据加载操作 */ @Test @TestStep(step = 3, desc = "测试基于spark的方式查询的数据结果") def testTableLoad: Unit = { this.initData // 一次加载整张的jdbc小表,注:大表严重不建议使用该方法 val df = this.fire.jdbcTableLoadAll(this.tableName) assert(df.count() == 5) // 根据指定分区字段的上下边界分布式加载数据 this.fire.jdbcTableLoadBound(this.tableName, "id", 1, 10, 2).show(100, false) val where = Array[String]("id >=1 and id <=3", "id >=6 and id <=9", "name='root'") // 根据指定的条件进行数据加载,条件的个数决定了load数据的并发度 val df2 = this.fire.jdbcTableLoad(tableName, where) assert(df2.count() == 3) } /** * 批量插入测试数据 */ private def initData: Unit = { this.truncate val timestamp = DateFormatUtils.formatCurrentDateTime() // 执行批量操作 val batchSql = s"INSERT INTO $tableName (name, age, createTime, length, sex) VALUES (?, ?, ?, ?, ?)" this.fire.jdbcBatchUpdate(batchSql, Seq(Seq("spark1", 21, timestamp, 100.123, 1), Seq("flink2", 22, timestamp, 12.236, 0), Seq("flink3", 22, timestamp, 12.236, 0), Seq("flink4", 22, timestamp, 12.236, 0), Seq("flink5", 27, timestamp, 17.236, 0))) } /** * 清空表 */ private def truncate: Unit = { this.fire.jdbcUpdate(s"truncate table $tableName") } } ================================================ FILE: fire-examples/spark-examples/src/test/scala/com/zto/fire/examples/spark/parser/SparkSQLParserTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
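// ----------------------------------------------------------------------------
// A minimal sketch (not from the repository) of the Spark-side JDBC helpers in
// JdbcUnitTest above: the same parameterized SQL can come back as a typed list,
// an RDD or a DataFrame, and whole tables can be loaded with explicit bounds or
// predicates. The table name and predicates are hypothetical; the calls mirror
// the test.
// ----------------------------------------------------------------------------
val querySql = "select * from spark_test where id in (?, ?, ?)"
val asList = this.fire.jdbcQueryList(querySql, Seq(1, 2, 3), classOf[Student]) // typed result list
val asRDD  = this.fire.jdbcQueryRDD(querySql, Seq(1, 2, 3))                    // result rows as an RDD
val asDF   = this.fire.jdbcQueryDF(querySql, Seq(1, 2, 3))                     // result rows as a DataFrame

// partitioned load: split the table on id between 1 and 10 into 2 partitions
this.fire.jdbcTableLoadBound("spark_test", "id", 1, 10, 2).show(10, false)

// predicate-driven load: one partition per where clause
val predicates = Array[String]("id >= 1 and id <= 3", "name = 'root'")
val loaded = this.fire.jdbcTableLoad("spark_test", predicates)
// ----------------------------------------------------------------------------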
*/ package com.zto.fire.examples.spark.parser import com.zto.fire.common.anno.{Config, TestStep} import com.zto.fire.core.anno.connector.Hive import com.zto.fire.common.bean.TableIdentifier import com.zto.fire.examples.bean.Student import com.zto.fire.examples.spark.core.SparkTester import com.zto.fire.println import com.zto.fire.spark.SparkCore import com.zto.fire.spark.sql.SparkSqlParser import org.junit.Test /** * 用于测试Spark SQL解析器 * * @author ChengLong * @date 2022年09月06日 13:58:59 * @since 2.3.2 */ @Hive("test") class SparkSQLParserTest extends SparkCore with SparkTester { val student = TableIdentifier("student") val baseorganize = TableIdentifier("dim.baseorganize") val baseuser = TableIdentifier("dim.baseuser") @Test @TestStep(step = 1, desc = "判断表属性") def testTable: Unit = { this.spark.createDataFrame(Student.newStudentList(), classOf[Student]).createOrReplaceTempView("student") println("student view: " + SparkSqlParser.isTempView(student)) assert(SparkSqlParser.isTempView(student)) println("student table: " + SparkSqlParser.isHiveTable(student)) assert(!SparkSqlParser.isHiveTable(student)) println("baseorganize view: " + SparkSqlParser.isTempView(baseorganize)) assert(!SparkSqlParser.isTempView(baseorganize)) println("baseorganize table: " + SparkSqlParser.isHiveTable(baseuser)) assert(SparkSqlParser.isHiveTable(baseorganize)) println("baseuser view: " + SparkSqlParser.isTempView(baseuser)) assert(!SparkSqlParser.isTempView(baseuser)) println("baseuser table: " + SparkSqlParser.isHiveTable(baseuser)) assert(SparkSqlParser.isHiveTable(baseuser)) } } ================================================ FILE: fire-examples/spark-examples/src/test/scala-spark-3.0/com/zto/fire/examples/spark/sql/SparkSqlParseTest.scala ================================================ package com.zto.fire.examples.spark.sql /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import com.zto.fire.core.anno.connector.Hive import com.zto.fire.examples.bean.Student import com.zto.fire.spark.SparkCore import com.zto.fire.spark.sql.SparkSqlParser import org.junit.{Before, Test} import scala.util.Try /** * Spark SQL血缘解析单元测试 * * @author ChengLong * @Date 2022-04-29 13:37:30 * @since 2.2.1 */ @Hive("test") class SparkSqlParseTest extends SparkCore { @Before def before: Unit = { this.init() } /** * 用于批量断言sql解析 */ private def assertSqlParse(assertMsg: String, sqls: String*): Unit = { sqls.foreach(sql => { val retVal = Try { SparkSqlParser.sqlParser(sql) } assert(retVal.isSuccess, assertMsg) }) } /** * 断言临时表与hive表 */ @Test def testTempView: Unit = { val ds = this.fire.createDataFrame(Student.newStudentList(), classOf[Student]) ds.createOrReplaceTempView("t_student") /*assert(SparkSqlParser.isTempView("t_student")) assert(!SparkSqlParser.isTempView(null, "t_student2")) assert(!SparkSqlParser.isHiveTable(null, "t_student")) assert(SparkSqlParser.isHiveTable("dim", "baseuser")) assert(!SparkSqlParser.isHiveTable("dim", "baseuser12"))*/ } /** * 断言select语句的解析 */ @Test def testSelect: Unit = { val select1 = """ |select count(*) |from (select * from st.st_fwzl_transfer_kpi_detail_month) a |left join (select biz_no,bill_code from dw.dw_kf_center_to_center_dispatch_delay where ds>='20210101') b |on a.bill_code=b.bill_code |""".stripMargin val select2 = """ |select bill_event_id,count(*) from hudi.hudi_bill_item group by bill_event_id |""".stripMargin this.assertSqlParse("select spark sql解析失败", select1, select2) } /** * 测试insert语句的解析 */ @Test def testInsert: Unit = { val insertInto = """ |insert into ods.base select a,v from tmp.t_user t1 left join ods.test t2 on t1.id=t2.id |""".stripMargin val insertOverwrite = "insert overwrite table dw.kwang_test partition(ds='202106', city='beijing') values(4,'zz')" val insertSelect = """ |insert into zto_cockpit_site_target_ds |SELECT site_id,scan_date,scan_day, |SUM(a.rec_cnt) rec_cnt, |SUM(a.order_cnt) order_cnt, |SUM(a.disp_cnt) disp_cnt, |SUM(a.sign_cnt) sign_cnt, |SUM(a.ele_cnt) ele_cnt, |SUM(a.bag_cnt) bag_cnt |FROM ( |SELECT t1.site_id,t1.scan_date,t1.scan_day , |t1.cnt rec_cnt, |0 order_cnt, |0 disp_cnt, |0 sign_cnt, |t1.ele_cnt ele_cnt, |t1.bag_cnt bag_cnt |FROM ztkb.zto_cockpit_site_rec_ds t1 |WHERE t1.scan_day = '#date#' |UNION ALL |SELECT t2.site_id,t2.order_date scan_date,t2.order_day scan_day , |0 rec_cnt, |t2.cnt order_cnt, |0 disp_cnt, |0 sign_cnt, |0 ele_cnt, |0 bag_cnt |FROM ztkb.zto_cockpit_site_order_ds t2 |WHERE t2.order_day = '#date#' |UNION ALL |SELECT t3.site_id,t3.scan_date,t3.scan_day , |0 rec_cnt, |0 order_cnt, |t3.cnt disp_cnt, |0 sign_cnt, |0 ele_cnt, |0 bag_cnt |FROM ztkb.zto_cockpit_site_disp_ds t3 |WHERE t3.scan_day = '#date#' |UNION ALL |select t.record_site_id site_id,t.sign_date scan_date,t.sign_day scan_day, |0 rec_cnt, |0 order_cnt, |0 disp_cnt, |sum(t.cnt) sign_cnt, |0 ele_cnt, |0 bag_cnt |from ztkb.zto_cockpit_site_sign_ds t |where t.sign_day = '#date#' |group by t.record_site_id,t.sign_date,t.sign_day |) a |GROUP BY site_id,scan_date,scan_day """.stripMargin this.assertSqlParse("insert spark sql解析失败", insertInto, insertOverwrite, insertSelect) } /** * 测试alter语句解析 */ @Test def testAlter: Unit = { val alterTableAddPartitionStatement = """ |alter table tmp.t_user add if not exists partition (ds='20210620', city = 'beijing') |""".stripMargin val renameTable = """ |alter table tmp.t_user rename to ods.t_user2 |""".stripMargin val dropPartition = """ |ALTER TABLE tmp.food DROP IF EXISTS PARTITION 
(ds='20151219', city = 'beijing') |""".stripMargin val renamePartition = """ |Alter table tmp.test partition (ds='201801', city='beijing') rename to partition(ds='202106', city='shanghai') |""".stripMargin this.assertSqlParse("解析alter语句失败", alterTableAddPartitionStatement, renameTable, dropPartition, renamePartition) } /** * 测试ddl语句的解析 */ @Test def testDDL: Unit = { val createTable = """ |CREATE TABLE `tmp.test`( | `dept_no` int, | `addr` string, | `tel` string) |partitioned by(ds string, city string) |ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' |""".stripMargin val dropTable = """ |drop table if exists tmp.test |""".stripMargin val dropDB = "drop database tmp" this.assertSqlParse("ddl语句解析失败", createTable, dropTable, dropDB) } } ================================================ FILE: fire-external/.gitignore ================================================ # use glob syntax. syntax: glob *.ser *.class *~ *.bak #*.off *.old # eclipse conf file .settings .classpath .project .manager .scala_dependencies # idea .idea *.iml # building target build null tmp* temp* dist test-output build.log # other scm .svn .CVS .hg* # switch to regexp syntax. # syntax: regexp # ^\.pc/ #SHITTY output not in target directory build.log ================================================ FILE: fire-external/fire-apollo/pom.xml ================================================ 4.0.0 fire-apollo_${scala.binary.version} jar Fire : Apollo com.zto.fire fire-external 2.3.2-SNAPSHOT ../pom.xml com.ctrip.framework.apollo apollo-client 1.7.0 org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-external/fire-apollo/src/main/resources/apollo.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # config app.id=fire dev.meta=http://localhost fat.meta=http://localhost uat.meta=http://localhost pro.meta=http://localhost sit.meta=http://localhost ================================================ FILE: fire-external/fire-apollo/src/main/scala/com/zto/fire/apollo/util/ApolloConfigUtil.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
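// ----------------------------------------------------------------------------
// A minimal sketch (not from the repository) of the assertion helper used by
// SparkSqlParseTest above: lineage parsing is exercised by pushing each SQL
// statement through SparkSqlParser.sqlParser inside a Try, so a single bad
// statement is reported instead of aborting the whole batch. The statements
// below are hypothetical.
// ----------------------------------------------------------------------------
import com.zto.fire.spark.sql.SparkSqlParser
import scala.util.Try

def parseAll(sqls: String*): Unit = {
  sqls.foreach { sql =>
    val result = Try(SparkSqlParser.sqlParser(sql))
    assert(result.isSuccess, s"failed to parse: $sql")
  }
}

parseAll(
  "insert into ods.base select a, v from tmp.t_user",
  "alter table tmp.t_user add if not exists partition (ds='20210620')"
)
// ----------------------------------------------------------------------------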
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.apollo.util import com.zto.fire.common.util.{Logging, PropUtils} import org.apache.commons.lang3.StringUtils import java.util.Properties object ApolloConfigUtil extends Logging { private val props = new Properties() this.load() def load(): Unit = { var apolloEnv = System.getProperty(ApolloConstant.APOLLO_ENV) if(StringUtils.isNotBlank(apolloEnv)){ apolloEnv = apolloEnv + ApolloConstant.APOLLO_META_SUFFIX }else{ apolloEnv = ApolloConstant.APOLLO_META_DEV } PropUtils.load(ApolloConstant.APOLLO_CONFIG_FILE) val appId = PropUtils.getString(ApolloConstant.APOLLO_APP_ID) val apolloMeta = PropUtils.getString(apolloEnv) if(StringUtils.isBlank(System.getProperty(ApolloConstant.APOLLO_APP_ID))) { System.setProperty(ApolloConstant.APOLLO_APP_ID, appId) } if(StringUtils.isBlank(System.getProperty(ApolloConstant.APOLLO_META))) { System.setProperty(ApolloConstant.APOLLO_META, apolloMeta) } val config = ConfigService.getAppConfig for (key <- config.getPropertyNames) { props.setProperty(key, config.getProperty(key, null)) } val changeListener = new ConfigChangeListener() { override def onChange(changeEvent: ConfigChangeEvent): Unit = { logger.info("Changes for namespace {}", changeEvent.getNamespace) for (key <- changeEvent.changedKeys) { val change = changeEvent.getChange(key) props.setProperty(change.getPropertyName, change.getNewValue) logger.info("Change - key: {}, oldValue: {}, newValue: {}, changeType: {}", change.getPropertyName, change.getOldValue, change.getNewValue, change.getChangeType) } } } config.addChangeListener(changeListener) } /** * 返回配置 * @return */ def getProp(): Properties ={ props } /** * 根据key获取配置信息 * * @param key * 配置的key * @return * 配置的value */ def getProperty(key: String, default: String = null): String = { props.getProperty(key, default) } /** * 获取字符串 * * @param key * @return */ def getString(key: String): String = { this.getProperty(key) } /** * 获取拼接后数值的配置字符串 * * @param key 配置的前缀 * @param keyNum 拼接到key后的数值后缀 * @return * 对应的配置信息 */ def getString(key: String, keyNum: Int = 0, default: String = ""): String = { if (keyNum <= 1) { var value = this.getProperty(key) if (StringUtils.isBlank(value)) { value = this.getString(key + "1", default) } value } else { this.getString(key + keyNum, default) } } /** * 获取字符串,为空则取默认值 * * @param key * @return */ def getString(key: String, default: String): String = { val value = this.getProperty(key) if (StringUtils.isNotBlank(value)) value else default } /** * 获取整型数据 * * @param key * @return */ def getInt(key: String): Int = { val value = this.getProperty(key) if (StringUtils.isNotBlank(value)) value.toInt else -1 } /** * 获取整型数据 * * @param key * @return */ def getInt(key: String, default: Int): Int = { val value = this.getProperty(key) if (StringUtils.isNotBlank(value)) value.toInt else default } /** * 获取拼接后数值的配置整数 * * @param key 配置的前缀 * @param keyNum 拼接到key后的数值后缀 * @return * 对应的配置信息 */ def getInt(key: String, keyNum: Int = 0, default: Int): Int = { val value = this.getString(key, keyNum, default + "") if (StringUtils.isNotBlank(value)) value.toInt else default } /** * 获取长整型数据 * * @param key * @return */ def getLong(key: 
String): Long = { val value = this.getProperty(key) if (StringUtils.isNotBlank(value)) value.toLong else -1L } /** * 获取长整型数据 * * @param key * @return */ def getLong(key: String, default: Long): Long = { val value = this.getProperty(key) if (StringUtils.isNotBlank(value)) value.toLong else default } /** * 获取float型数据 * * @param key * @return */ def getFloat(key: String): Float = { val value = this.getProperty(key) if (StringUtils.isNotBlank(value)) value.toFloat else -1 } /** * 获取float型数据 * * @param key * @return */ def getFloat(key: String, default: Float): Float = { val value = this.getProperty(key) if (StringUtils.isNotBlank(value)) value.toFloat else default } /** * 获取float型数据 * * @param key * @return */ def getDouble(key: String): Double = { val value = this.getProperty(key) if (StringUtils.isNotBlank(value)) value.toDouble else -1.0 } /** * 获取float型数据 * * @param key * @return */ def getDouble(key: String, default: Double): Double = { val value = this.getProperty(key) if (StringUtils.isNotBlank(value)) value.toDouble else default } /** * 获取拼接后数值的配置长整数 * * @param key 配置的前缀 * @param keyNum 拼接到key后的数值后缀 * @return * 对应的配置信息 */ def getLong(key: String, keyNum: Int = 0, default: Long): Long = { val value = this.getString(key, keyNum, default + "") if (StringUtils.isNotBlank(value)) value.toLong else default } /** * 获取布尔值数据 */ def getBoolean(key: String): Boolean = { this.getProperty(key, "false").toBoolean } /** * 获取布尔值数据 */ def getBoolean(key: String, default: Boolean): Boolean = { val value = this.getBoolean(key) if (value != null) value else default } /** * 获取拼接后数值的配置布尔值 * * @param key 配置的前缀 * @param keyNum 拼接到key后的数值后缀 * @return * 对应的配置信息 */ def getBoolean(key: String, keyNum: Int = 0, default: Boolean): Boolean = { val value = this.getString(key, keyNum, default + "") if (StringUtils.isNotBlank(value)) value.toBoolean else default } def getEvn(env:String) { EnvUtils.transformEnv(env); } } ================================================ FILE: fire-external/fire-apollo/src/main/scala/com/zto/fire/apollo/util/ApolloConstant.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.apollo.util object ApolloConstant { lazy val APOLLO_CONFIG_FILE = "apollo.properties" lazy val APOLLO_APP_ID = "app.id" lazy val APOLLO_META = "apollo.meta" lazy val APOLLO_ENV = "apollo.env" lazy val APOLLO_META_SUFFIX = ".meta" lazy val APOLLO_META_DEV = "dev.meta" } ================================================ FILE: fire-external/fire-apollo/src/test/scala/com/zto/fire/apollo/util/ApolloConfigUtilTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
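// ----------------------------------------------------------------------------
// A minimal sketch (not from the repository) of ApolloConfigUtil's keyNum
// convention above: getString(key, keyNum, default) looks up key + keyNum when
// keyNum > 1, and for keyNum <= 1 falls back from key to key + "1" before using
// the default. The property names below are hypothetical.
// ----------------------------------------------------------------------------
System.setProperty(ApolloConstant.APOLLO_ENV, "dev") // pick the dev.meta address

val primaryUrl   = ApolloConfigUtil.getString("demo.jdbc.url", keyNum = 1, default = "")
val secondaryUrl = ApolloConfigUtil.getString("demo.jdbc.url", keyNum = 2, default = "") // reads demo.jdbc.url2
val poolSize     = ApolloConfigUtil.getInt("demo.jdbc.pool.size", default = 10)
// ----------------------------------------------------------------------------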
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.apollo.util object ApolloConfigUtilTest { def main(args: Array[String]): Unit = { //使用说明:默认使用读取dev环境的配置 System.setProperty(ApolloConstant.APOLLO_ENV, "dev") println(ApolloConfigUtil.getProp) println(ApolloConfigUtil.getInt("test")) } } ================================================ FILE: fire-external/pom.xml ================================================ 4.0.0 fire-external pom fire-external com.zto.fire fire-parent 2.3.2-SNAPSHOT ../pom.xml fire-apollo com.zto.fire fire-common_${scala.binary.version} ${fire.version} ${maven.scope} com.zto.fire fire-core_${scala.binary.version} ${fire.version} ${maven.scope} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-metrics/pom.xml ================================================ 4.0.0 fire-metrics_${scala.binary.version} Fire : Metrics com.zto.fire fire-parent 2.3.2-SNAPSHOT ../pom.xml 3.1.5 4.7.1 io.dropwizard.metrics metrics-core ${codahale.metrics.version} io.dropwizard.metrics metrics-jvm ${codahale.metrics.version} io.dropwizard.metrics metrics-json ${codahale.metrics.version} io.dropwizard.metrics metrics-ganglia ${codahale.metrics.version} io.dropwizard.metrics metrics-graphite ${codahale.metrics.version} org.antlr antlr4-runtime ${antlr.version} org.slf4j slf4j-simple 1.7.25 org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-metrics/src/main/java/com/zto/fire/metrics/MetricsDemo.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.metrics import java.util.Random import java.util.concurrent.TimeUnit import com.codahale.metrics.jvm.{FileDescriptorRatioGauge, GarbageCollectorMetricSet, MemoryUsageGaugeSet, ThreadStatesGaugeSet} import com.codahale.metrics.{ConsoleReporter, MetricRegistry, Slf4jReporter} /** * Metrics模块测试 * * @author ChengLong * @since 2.0.0 * @create 2020-12-17 10:11 */ class MetricsDemo { val metrics = new MetricRegistry() // @Test def testMeter: Unit = { val reporter = ConsoleReporter.forRegistry(metrics).convertRatesTo(TimeUnit.SECONDS).convertDurationsTo(TimeUnit.MILLISECONDS).build reporter.start(1, TimeUnit.SECONDS) val requests = metrics.meter("requests") (1 to 100).foreach(i => { requests.mark() Thread.sleep(10) }) Thread.sleep(1000) } // @Test def testHistogram: Unit = { val reporter = ConsoleReporter.forRegistry(metrics).convertRatesTo(TimeUnit.SECONDS).convertDurationsTo(TimeUnit.MILLISECONDS).build reporter.start(1, TimeUnit.SECONDS) val reporter2 = Slf4jReporter.forRegistry(metrics).convertDurationsTo(TimeUnit.SECONDS).convertDurationsTo(TimeUnit.MILLISECONDS).withLoggingLevel(Slf4jReporter.LoggingLevel.ERROR).build reporter2.start(1, TimeUnit.SECONDS) val resultCounts = metrics.histogram(MetricRegistry.name(classOf[MetricsDemo], "result-counts")) val random = new Random() (1 to 1000).foreach(i => { resultCounts.update(random.nextInt(100)) Thread.sleep(10) }) Thread.sleep(1000) } // @Test def testJvm: Unit = { val reporter2 = ConsoleReporter.forRegistry(metrics) .convertRatesTo(TimeUnit.SECONDS) .convertDurationsTo(TimeUnit.MILLISECONDS) .build reporter2.start(3, TimeUnit.SECONDS) val reporter = Slf4jReporter.forRegistry(metrics).convertDurationsTo(TimeUnit.SECONDS).convertDurationsTo(TimeUnit.MILLISECONDS).withLoggingLevel(Slf4jReporter.LoggingLevel.ERROR).build reporter.start(5, TimeUnit.SECONDS) metrics.register("jvm.gc", new GarbageCollectorMetricSet()) metrics.register("jvm.memroy", new MemoryUsageGaugeSet()) metrics.register("jvm.thread-states", new ThreadStatesGaugeSet()) metrics.register("jvm.fd.usage", new FileDescriptorRatioGauge()) Thread.sleep(100000) } } ================================================ FILE: fire-metrics/src/test/java/com/zto/fire/jmx/Hello.java ================================================ package com.zto.fire.jmx; public class Hello implements HelloMBean { private final String name = "Reginald"; private int cacheSize = DEFAULT_CACHE_SIZE; private static final int DEFAULT_CACHE_SIZE = 200; public void sayHello() { System.out.println("hello, world"); } public int add(int x, int y) { return x + y; } public String getName() { return this.name; } public int getCacheSize() { return this.cacheSize; } public synchronized void setCacheSize(int size) { this.cacheSize = size; System.out.println("Cache size now " + this.cacheSize); } } ================================================ FILE: fire-metrics/src/test/java/com/zto/fire/jmx/HelloMBean.java ================================================ package com.zto.fire.jmx; public interface HelloMBean { public void sayHello(); public int add(int x, int y); public String getName(); public int getCacheSize(); public void setCacheSize(int size); } ================================================ FILE: fire-metrics/src/test/java/com/zto/fire/jmx/JmxApp.java ================================================ package com.zto.fire.jmx; import javax.management.MBeanServer; import javax.management.ObjectName; import java.lang.management.ManagementFactory; import java.util.Queue; import 
java.util.concurrent.ArrayBlockingQueue; public class JmxApp { public static void main(String[] args) throws Exception { // 最基本的MBean MBeanServer mbs = ManagementFactory.getPlatformMBeanServer(); ObjectName name = new ObjectName("com.zto.fire.jmx:type=Hello"); Hello mbean = new Hello(); mbs.registerMBean(mbean, name); // 复杂类型的MXBean ObjectName mxbeanName = new ObjectName("com.zto.fire.jmx:type=QueueSampler"); Queue queue = new ArrayBlockingQueue(10); queue.add("Request-1"); queue.add("Request-2"); queue.add("Request-3"); QueueSampler mxbean = new QueueSampler(queue); mbs.registerMBean(mxbean, mxbeanName); System.out.println("Waiting forever..."); Thread.sleep(Long.MAX_VALUE); } } ================================================ FILE: fire-metrics/src/test/java/com/zto/fire/jmx/QueueSample.java ================================================ package com.zto.fire.jmx; import java.beans.ConstructorProperties; import java.util.Date; public class QueueSample { private final Date date; private final int size; private final String head; @ConstructorProperties({"date", "size", "head"}) public QueueSample(Date date, int size, String head) { this.date = date; this.size = size; this.head = head; } public Date getDate() { return date; } public int getSize() { return size; } public String getHead() { return head; } } ================================================ FILE: fire-metrics/src/test/java/com/zto/fire/jmx/QueueSampler.java ================================================ package com.zto.fire.jmx; import java.util.Date; import java.util.Queue; public class QueueSampler implements QueueSamplerMXBean { private Queue queue; public QueueSampler(Queue queue) { this.queue = queue; } public QueueSample getQueueSample() { synchronized (queue) { return new QueueSample(new Date(), queue.size(), queue.peek()); } } public void clearQueue() { synchronized (queue) { queue.clear(); } } } ================================================ FILE: fire-metrics/src/test/java/com/zto/fire/jmx/QueueSamplerMXBean.java ================================================ package com.zto.fire.jmx; public interface QueueSamplerMXBean { public QueueSample getQueueSample(); public void clearQueue(); } ================================================ FILE: fire-metrics/src/test/scala/com.zto.fire.metrics/MetricsTest.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
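// ----------------------------------------------------------------------------
// A minimal sketch (not from the repository) of the Dropwizard metrics usage in
// MetricsDemo/MetricsTest above: a MetricRegistry feeds a ConsoleReporter and a
// meter is marked once per processed request. The metric name is hypothetical.
// ----------------------------------------------------------------------------
import java.util.concurrent.TimeUnit
import com.codahale.metrics.{ConsoleReporter, MetricRegistry}

val registry = new MetricRegistry()
val reporter = ConsoleReporter.forRegistry(registry)
  .convertRatesTo(TimeUnit.SECONDS)
  .convertDurationsTo(TimeUnit.MILLISECONDS)
  .build
reporter.start(1, TimeUnit.SECONDS)

val requests = registry.meter("demo.requests")
(1 to 100).foreach { _ =>
  requests.mark() // one event per processed request
  Thread.sleep(10)
}
// ----------------------------------------------------------------------------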
*/ package com.zto.fire.metrics import java.util.Random import java.util.concurrent.TimeUnit import com.codahale.metrics.jvm.{FileDescriptorRatioGauge, GarbageCollectorMetricSet, MemoryUsageGaugeSet, ThreadStatesGaugeSet} import com.codahale.metrics.{ConsoleReporter, JmxReporter, MetricRegistry, Slf4jReporter} import org.antlr.v4.runtime.tree.ParseTreeWalker import org.antlr.v4.runtime.{CharStreams, CommonTokenStream} import org.junit.Test /** * Metrics模块测试 * 文档 * @author ChengLong * @since 2.0.0 * @create 2020-12-17 10:11 */ class MetricsTest { val metrics = new MetricRegistry() @Test def testMeter: Unit = { val reporter = ConsoleReporter.forRegistry(metrics).convertRatesTo(TimeUnit.SECONDS).convertDurationsTo(TimeUnit.MILLISECONDS).build reporter.start(1, TimeUnit.SECONDS) val requests = metrics.meter("requests") /*(1 to 100).foreach(i => { requests.mark() Thread.sleep(10) }) Thread.sleep(1000)*/ } @Test def testHistogram: Unit = { val jmxReporter = JmxReporter.forRegistry(metrics) .convertDurationsTo(TimeUnit.SECONDS) .convertRatesTo(TimeUnit.SECONDS) //.withLoggingLevel(Slf4jReporter.LoggingLevel.INFO) .build jmxReporter.start(/*3, TimeUnit.SECONDS*/) val consoleReporter = ConsoleReporter.forRegistry(metrics) .convertDurationsTo(TimeUnit.MILLISECONDS) .convertRatesTo(TimeUnit.SECONDS) //.withLoggingLevel(Slf4jReporter.LoggingLevel.INFO) .build consoleReporter.start(3, TimeUnit.SECONDS) val resultCounts = metrics.timer("result-counts") val resultCounts2 = metrics.meter("cost") val random = new Random() /*(1 to 10000).foreach(i => { // resultCounts.update(random.nextInt(100)) val start = System.currentTimeMillis() Thread.sleep(random.nextInt(50)) val end = System.currentTimeMillis() - start resultCounts.update(end, TimeUnit.MILLISECONDS) resultCounts2.mark() }) Thread.sleep(10000)*/ } @Test def testJvm: Unit = { val reporter2 = ConsoleReporter.forRegistry(metrics) .convertRatesTo(TimeUnit.SECONDS) .convertDurationsTo(TimeUnit.MILLISECONDS) .build reporter2.start(3, TimeUnit.SECONDS) val reporter = Slf4jReporter.forRegistry(metrics).convertDurationsTo(TimeUnit.SECONDS).convertDurationsTo(TimeUnit.MILLISECONDS).withLoggingLevel(Slf4jReporter.LoggingLevel.ERROR).build reporter.start(5, TimeUnit.SECONDS) metrics.register("jvm.gc", new GarbageCollectorMetricSet()) metrics.register("jvm.memroy", new MemoryUsageGaugeSet()) metrics.register("jvm.thread-states", new ThreadStatesGaugeSet()) metrics.register("jvm.fd.usage", new FileDescriptorRatioGauge()) // Thread.sleep(100000) } /*@Test def testAntlr: Unit = { val input = CharStreams.fromString( """ |a=(1+2+3)*10/5 |a |""".stripMargin) val lexer = new HelloLexer(input) val tokens = new CommonTokenStream(lexer) val parser = new HelloParser(tokens) val tree = parser.prog() val visitor = new HelloMyVisitor() visitor.visit(tree) } @Test def testArrayInit: Unit = { val input = CharStreams.fromString("{1,2,{3}}") val lexer = new ArrayInitLexer(input) val tokens = new CommonTokenStream(lexer) val parser = new ArrayInitParser(tokens) val tree = parser.init() val walker = new ParseTreeWalker walker.walk(new MyArrayInitListener(), tree) println() }*/ } ================================================ FILE: fire-platform/pom.xml ================================================ 4.0.0 fire-platform pom Fire : Platform : fire-parent com.zto.fire 2.3.2-SNAPSHOT org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-shell/flink-shell/pom.xml 
================================================ 4.0.0 flink-shell_${flink.reference} Fire : Shell : Flink fire-shell com.zto.fire 2.3.2-SNAPSHOT com.zto.fire fire-flink_${flink.reference} ${fire.version} ${maven.scope} com.github.scopt scopt_${scala.binary.version} 3.5.0 org.apache.flink flink-clients_${scala.binary.version} ${flink.version} org.apache.flink flink-scala_${scala.binary.version} ${flink.version} org.apache.flink flink-streaming-scala_${scala.binary.version} ${flink.version} org.apache.flink flink-table-api-scala-bridge_${scala.binary.version} ${flink.version} org.apache.flink flink-test-utils_${scala.binary.version} ${flink.version} test org.apache.flink flink-java ${flink.version} org.apache.flink flink-queryable-state-client-java ${flink.version} org.apache.flink flink-statebackend-rocksdb_${scala.binary.version} ${flink.version} org.apache.flink flink-connector-kafka_${scala.binary.version} ${flink.version} org.apache.kafka kafka_${scala.binary.version} ${kafka.version} org.apache.flink flink-table-api-java-bridge_${scala.binary.version} ${flink.version} org.apache.flink flink-table-api-java ${flink.version} org.apache.flink flink-table-common ${flink.version} org.apache.flink flink-connector-hive_${scala.binary.version} ${flink.version} org.apache.flink flink-connector-jdbc_${scala.binary.version} ${flink.version} org.apache.flink flink-json ${flink.version} ${maven.scope} org.apache.flink flink-connector-elasticsearch-base_${scala.binary.version} ${flink.version} org.apache.flink flink-hadoop-compatibility_${scala.binary.version} ${flink.version} org.apache.rocketmq rocketmq-client ${rocketmq.version} org.apache.rocketmq rocketmq-acl ${rocketmq.version} org.apache.flink flink-orc-nohive_${scala.binary.version} ${flink.version} org.apache.flink flink-shaded-hadoop-2-uber 2.6.5-8.0 ${maven.scope} javax.servlet servlet-api org.apache.hive hive-exec ${hive.flink.version} ${maven.scope} calcite-core org.apache.calcite org.apache.hbase hbase-common ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-server ${hbase.version} ${maven.scope} org.apache.hbase hbase-client org.apache.hbase hbase-client ${hbase.version} ${maven.scope} calcite-core org.apache.calcite com.google.guava guava ${guava.version} ================================================ FILE: fire-shell/flink-shell/src/main/java/org/apache/flink/api/java/JarHelper.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.api.java; import java.io.*; import java.util.jar.JarEntry; import java.util.jar.JarInputStream; import java.util.jar.JarOutputStream; /** * Provides utility services for jarring and unjarring files and directories. 
Note that a given * instance of JarHelper is not threadsafe with respect to multiple jar operations. * *
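To make the utility described above concrete, here is a minimal, hedged usage sketch; the paths and the wrapper class are hypothetical and not part of this repository:

    import java.io.File;
    import java.io.IOException;

    class JarHelperExample {
        public static void main(String[] args) throws IOException {
            JarHelper helper = new JarHelper();
            helper.setVerbose(true);                                  // print each entry while (un)jarring
            File classesDir = new File("/tmp/session-classes");      // hypothetical input directory
            File sessionJar = new File("/tmp/session.jar");          // hypothetical output jar
            helper.jarDir(classesDir, sessionJar);                    // jar the whole directory
            helper.unjarDir(sessionJar, new File("/tmp/unpacked"));   // unpack it again
        }
    }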

Copied from * http://grepcode.com/file_/repo1.maven.org/maven2/org.apache.xmlbeans/xmlbeans/2.4.0/org/apache/xmlbeans/impl/common/JarHelper.java/?v=source * *

Author: Patrick Calahan pcal@bea.com */ public class JarHelper { // ======================================================================== // Constants private static final int BUFFER_SIZE = 2156; // ======================================================================== // Variables private byte[] mBuffer = new byte[BUFFER_SIZE]; private int mByteCount = 0; private boolean mVerbose = false; private String mDestJarName = ""; // ======================================================================== // Constructor /** Instantiates a new JarHelper. */ public JarHelper() {} // ======================================================================== // Public methods /** Jars a given directory or single file into a JarOutputStream. */ public void jarDir(File dirOrFile2Jar, File destJar) throws IOException { if (dirOrFile2Jar == null || destJar == null) { throw new IllegalArgumentException(); } mDestJarName = destJar.getCanonicalPath(); FileOutputStream fout = new FileOutputStream(destJar); JarOutputStream jout = new JarOutputStream(fout); // jout.setLevel(0); try { jarDir(dirOrFile2Jar, jout, null); } catch (IOException ioe) { throw ioe; } finally { jout.close(); fout.close(); } } /** Unjars a given jar file into a given directory. */ public void unjarDir(File jarFile, File destDir) throws IOException { FileInputStream fis = new FileInputStream(jarFile); unjar(fis, destDir); } /** Given an InputStream on a jar file, unjars the contents into the given directory. */ public void unjar(InputStream in, File destDir) throws IOException { JarInputStream jis = new JarInputStream(in); JarEntry entry; while ((entry = jis.getNextJarEntry()) != null) { if (entry.isDirectory()) { File dir = new File(destDir, entry.getName()); dir.mkdir(); if (entry.getTime() != -1) { dir.setLastModified(entry.getTime()); } continue; } int count; byte[] data = new byte[BUFFER_SIZE]; File destFile = new File(destDir, entry.getName()); if (mVerbose) { System.out.println("unjarring " + destFile + " from " + entry.getName()); } FileOutputStream fos = new FileOutputStream(destFile); BufferedOutputStream dest = new BufferedOutputStream(fos, BUFFER_SIZE); try { while ((count = jis.read(data, 0, BUFFER_SIZE)) != -1) { dest.write(data, 0, count); } dest.flush(); } finally { dest.close(); } if (entry.getTime() != -1) { destFile.setLastModified(entry.getTime()); } } jis.close(); } public void setVerbose(boolean b) { mVerbose = b; } // ======================================================================== // Private methods private static final char SEP = '/'; /** Recursively jars up the given path under the given directory. */ private void jarDir(File dirOrFile2jar, JarOutputStream jos, String path) throws IOException { if (mVerbose) { System.out.println("checking " + dirOrFile2jar); } if (dirOrFile2jar.isDirectory()) { String[] dirList = dirOrFile2jar.list(); String subPath = (path == null) ? 
"" : (path + dirOrFile2jar.getName() + SEP); if (path != null) { JarEntry je = new JarEntry(subPath); je.setTime(dirOrFile2jar.lastModified()); jos.putNextEntry(je); jos.flush(); jos.closeEntry(); } for (int i = 0; i < dirList.length; i++) { File f = new File(dirOrFile2jar, dirList[i]); jarDir(f, jos, subPath); } } else if (dirOrFile2jar.exists()) { if (dirOrFile2jar.getCanonicalPath().equals(mDestJarName)) { if (mVerbose) { System.out.println("skipping " + dirOrFile2jar.getPath()); } return; } if (mVerbose) { System.out.println("adding " + dirOrFile2jar.getPath()); } FileInputStream fis = new FileInputStream(dirOrFile2jar); try { JarEntry entry = new JarEntry(path + dirOrFile2jar.getName()); entry.setTime(dirOrFile2jar.lastModified()); jos.putNextEntry(entry); while ((mByteCount = fis.read(mBuffer)) != -1) { jos.write(mBuffer, 0, mByteCount); if (mVerbose) { System.out.println("wrote " + mByteCount + " bytes"); } } jos.flush(); jos.closeEntry(); } catch (IOException ioe) { throw ioe; } finally { fis.close(); } } } // for debugging public static void main(String[] args) throws IOException { if (args.length < 2) { System.err.println("Usage: JarHelper jarname.jar directory"); return; } JarHelper jarHelper = new JarHelper(); jarHelper.mVerbose = true; File destJar = new File(args[0]); File dirOrFile2Jar = new File(args[1]); jarHelper.jarDir(dirOrFile2Jar, destJar); } } ================================================ FILE: fire-shell/flink-shell/src/main/java/org/apache/flink/api/java/ScalaShellEnvironment.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.api.java; import com.zto.fire.shell.flink.FireILoop; import org.apache.flink.annotation.Internal; import org.apache.flink.api.common.InvalidProgramException; import org.apache.flink.configuration.ConfigUtils; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.DeploymentOptions; import org.apache.flink.configuration.PipelineOptions; import org.apache.flink.core.execution.JobClient; import org.apache.flink.util.JarUtils; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.List; import static org.apache.flink.util.Preconditions.checkNotNull; import static org.apache.flink.util.Preconditions.checkState; /** * Special version of {@link ExecutionEnvironment} that has a reference to * a {@link com.zto.fire.shell.flink.FireILoop}. When execute is called this will use the * reference of the ILoop to write the compiled classes of the current session to a Jar file and * submit these with the program. */ @Internal public class ScalaShellEnvironment extends ExecutionEnvironment { /** The jar files that need to be attached to each job. 
*/ private final List jarFiles; /** reference to Scala Shell, for access to virtual directory. */ private final FireILoop fireILoop; public ScalaShellEnvironment( final Configuration configuration, final FireILoop fireILoop, final String... jarFiles) { super(validateAndGetConfiguration(configuration)); this.fireILoop = checkNotNull(fireILoop); this.jarFiles = checkNotNull(JarUtils.getJarFiles(jarFiles)); } private static Configuration validateAndGetConfiguration(final Configuration configuration) { if (!ExecutionEnvironment.areExplicitEnvironmentsAllowed()) { throw new InvalidProgramException( "The RemoteEnvironment cannot be instantiated when running in a pre-defined context " + "(such as Command Line Client, Scala Shell, or TestEnvironment)"); } return checkNotNull(configuration); } @Override public JobClient executeAsync(String jobName) throws Exception { updateDependencies(); return super.executeAsync(jobName); } private void updateDependencies() throws Exception { final Configuration configuration = getConfiguration(); checkState( configuration.getBoolean(DeploymentOptions.ATTACHED), "Only ATTACHED mode is supported by the scala shell."); final List updatedJarFiles = getUpdatedJarFiles(); ConfigUtils.encodeCollectionToConfig( configuration, PipelineOptions.JARS, updatedJarFiles, URL::toString); } private List getUpdatedJarFiles() throws MalformedURLException { final URL jarUrl = fireILoop.writeFilesToDisk().getAbsoluteFile().toURI().toURL(); final List allJarFiles = new ArrayList<>(jarFiles); allJarFiles.add(jarUrl); return allJarFiles; } public static void disableAllContextAndOtherEnvironments() { initializeContextEnvironment( () -> { throw new UnsupportedOperationException( "Execution Environment is already defined for this shell."); }); } public static void resetContextEnvironments() { ExecutionEnvironment.resetContextEnvironment(); } } ================================================ FILE: fire-shell/flink-shell/src/main/java/org/apache/flink/api/java/ScalaShellStreamEnvironment.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.api.java; import com.zto.fire.shell.flink.FireILoop; import org.apache.flink.annotation.Internal; import org.apache.flink.api.common.InvalidProgramException; import org.apache.flink.configuration.ConfigUtils; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.DeploymentOptions; import org.apache.flink.configuration.PipelineOptions; import org.apache.flink.core.execution.JobClient; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.graph.StreamGraph; import org.apache.flink.util.JarUtils; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.List; import static org.apache.flink.util.Preconditions.checkNotNull; import static org.apache.flink.util.Preconditions.checkState; /** A {@link StreamExecutionEnvironment} for the Scala shell. */ @Internal public class ScalaShellStreamEnvironment extends StreamExecutionEnvironment { /** The jar files that need to be attached to each job. */ private final List jarFiles; /** reference to Scala Shell, for access to virtual directory. */ private final FireILoop fireILoop; public ScalaShellStreamEnvironment( final Configuration configuration, final FireILoop fireILoop, final String... jarFiles) { super(validateAndGetConfiguration(configuration)); this.fireILoop = checkNotNull(fireILoop); this.jarFiles = checkNotNull(JarUtils.getJarFiles(jarFiles)); } private static Configuration validateAndGetConfiguration(final Configuration configuration) { if (!ExecutionEnvironment.areExplicitEnvironmentsAllowed()) { throw new InvalidProgramException( "The RemoteEnvironment cannot be used when submitting a program through a client, " + "or running in a TestEnvironment context."); } return checkNotNull(configuration); } @Override public JobClient executeAsync(StreamGraph streamGraph) throws Exception { updateDependencies(); return super.executeAsync(streamGraph); } private void updateDependencies() throws Exception { checkState( configuration.getBoolean(DeploymentOptions.ATTACHED), "Only ATTACHED mode is supported by the scala shell."); final List updatedJarFiles = getUpdatedJarFiles(); ConfigUtils.encodeCollectionToConfig( configuration, PipelineOptions.JARS, updatedJarFiles, URL::toString); } public Configuration getClientConfiguration() { return configuration; } private List getUpdatedJarFiles() throws MalformedURLException { final URL jarUrl = fireILoop.writeFilesToDisk().getAbsoluteFile().toURI().toURL(); final List allJarFiles = new ArrayList<>(jarFiles); allJarFiles.add(jarUrl); return allJarFiles; } public static void disableAllContextAndOtherEnvironments() { initializeContextEnvironment( configuration -> { throw new UnsupportedOperationException( "Execution Environment is already defined for this shell."); }); } public static void resetContextEnvironments() { StreamExecutionEnvironment.resetContextEnvironment(); } } ================================================ FILE: fire-shell/flink-shell/src/main/java-flink-1.12/org.apache.flink.streaming.api.environment/StreamExecutionEnvironment.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.streaming.api.environment; import org.apache.flink.annotation.Experimental; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.Public; import org.apache.flink.annotation.PublicEvolving; import org.apache.flink.api.common.ExecutionConfig; import org.apache.flink.api.common.InvalidProgramException; import org.apache.flink.api.common.JobExecutionResult; import org.apache.flink.api.common.RuntimeExecutionMode; import org.apache.flink.api.common.cache.DistributedCache; import org.apache.flink.api.common.eventtime.WatermarkStrategy; import org.apache.flink.api.common.functions.InvalidTypesException; import org.apache.flink.api.common.io.FileInputFormat; import org.apache.flink.api.common.io.FilePathFilter; import org.apache.flink.api.common.io.InputFormat; import org.apache.flink.api.common.restartstrategy.RestartStrategies; import org.apache.flink.api.common.time.Time; import org.apache.flink.api.common.typeinfo.BasicTypeInfo; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.connector.source.Boundedness; import org.apache.flink.api.connector.source.Source; import org.apache.flink.api.connector.source.lib.NumberSequenceSource; import org.apache.flink.api.dag.Transformation; import org.apache.flink.api.java.ClosureCleaner; import org.apache.flink.api.java.Utils; import org.apache.flink.api.java.io.TextInputFormat; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.api.java.tuple.Tuple3; import org.apache.flink.api.java.typeutils.MissingTypeInfo; import org.apache.flink.api.java.typeutils.PojoTypeInfo; import org.apache.flink.api.java.typeutils.ResultTypeQueryable; import org.apache.flink.api.java.typeutils.TypeExtractor; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.CoreOptions; import org.apache.flink.configuration.DeploymentOptions; import org.apache.flink.configuration.ExecutionOptions; import org.apache.flink.configuration.PipelineOptions; import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.configuration.RestOptions; import org.apache.flink.core.execution.DefaultExecutorServiceLoader; import org.apache.flink.core.execution.DetachedJobExecutionResult; import org.apache.flink.core.execution.JobClient; import org.apache.flink.core.execution.JobListener; import org.apache.flink.core.execution.PipelineExecutor; import org.apache.flink.core.execution.PipelineExecutorFactory; import org.apache.flink.core.execution.PipelineExecutorServiceLoader; import org.apache.flink.core.fs.Path; import org.apache.flink.runtime.state.KeyGroupRangeAssignment; import org.apache.flink.runtime.state.StateBackend; import org.apache.flink.runtime.state.StateBackendLoader; import org.apache.flink.streaming.api.CheckpointingMode; import org.apache.flink.streaming.api.TimeCharacteristic; import org.apache.flink.streaming.api.datastream.DataStream; import 
org.apache.flink.streaming.api.datastream.DataStreamSource; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; import org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction; import org.apache.flink.streaming.api.functions.source.ContinuousFileReaderOperatorFactory; import org.apache.flink.streaming.api.functions.source.FileMonitoringFunction; import org.apache.flink.streaming.api.functions.source.FileProcessingMode; import org.apache.flink.streaming.api.functions.source.FileReadFunction; import org.apache.flink.streaming.api.functions.source.FromElementsFunction; import org.apache.flink.streaming.api.functions.source.FromIteratorFunction; import org.apache.flink.streaming.api.functions.source.FromSplittableIteratorFunction; import org.apache.flink.streaming.api.functions.source.InputFormatSourceFunction; import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction; import org.apache.flink.streaming.api.functions.source.SocketTextStreamFunction; import org.apache.flink.streaming.api.functions.source.SourceFunction; import org.apache.flink.streaming.api.functions.source.StatefulSequenceSource; import org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit; import org.apache.flink.streaming.api.graph.StreamGraph; import org.apache.flink.streaming.api.graph.StreamGraphGenerator; import org.apache.flink.streaming.api.graph.StreamingJobGraphGenerator; import org.apache.flink.streaming.api.operators.StreamSource; import org.apache.flink.util.DynamicCodeLoadingException; import org.apache.flink.util.ExceptionUtils; import org.apache.flink.util.FlinkException; import org.apache.flink.util.InstantiationUtil; import org.apache.flink.util.Preconditions; import org.apache.flink.util.SplittableIterator; import org.apache.flink.util.StringUtils; import org.apache.flink.util.WrappingRuntimeException; import com.esotericsoftware.kryo.Serializer; import javax.annotation.Nullable; import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; import static org.apache.flink.util.Preconditions.checkNotNull; /** * The StreamExecutionEnvironment is the context in which a streaming program is executed. A {@link * LocalStreamEnvironment} will cause execution in the current JVM, a {@link * RemoteStreamEnvironment} will cause execution on a remote setup. * *
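As a hedged illustration of the distinction drawn above, the two environments can be obtained roughly like this (host, port and jar path are made-up values):

    // runs the pipeline inside the current JVM
    StreamExecutionEnvironment localEnv = StreamExecutionEnvironment.createLocalEnvironment();
    // submits to a running remote cluster, shipping the listed jar files
    StreamExecutionEnvironment remoteEnv =
            StreamExecutionEnvironment.createRemoteEnvironment("jobmanager-host", 8081, "/path/to/job.jar");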

The environment provides methods to control the job execution (such as setting the parallelism * or the fault tolerance/checkpointing parameters) and to interact with the outside world (data * access). * * @see org.apache.flink.streaming.api.environment.LocalStreamEnvironment * @see org.apache.flink.streaming.api.environment.RemoteStreamEnvironment */ @Public public class StreamExecutionEnvironment { /** The default name to use for a streaming job if no other name has been specified. */ public static final String DEFAULT_JOB_NAME = "Flink Streaming Job"; /** The time characteristic that is used if none other is set. */ private static final TimeCharacteristic DEFAULT_TIME_CHARACTERISTIC = TimeCharacteristic.EventTime; /** * The environment of the context (local by default, cluster if invoked through command line). */ private static StreamExecutionEnvironmentFactory contextEnvironmentFactory = null; /** The ThreadLocal used to store {@link StreamExecutionEnvironmentFactory}. */ private static final ThreadLocal threadLocalContextEnvironmentFactory = new ThreadLocal<>(); /** The default parallelism used when creating a local environment. */ private static int defaultLocalParallelism = Runtime.getRuntime().availableProcessors(); // ------------------------------------------------------------------------ /** The execution configuration for this environment. */ private final ExecutionConfig config = new ExecutionConfig(); /** Settings that control the checkpointing behavior. */ private final CheckpointConfig checkpointCfg = new CheckpointConfig(); protected final List> transformations = new ArrayList<>(); private long bufferTimeout = StreamingJobGraphGenerator.UNDEFINED_NETWORK_BUFFER_TIMEOUT; protected boolean isChainingEnabled = true; /** The state backend used for storing k/v state and state snapshots. */ private StateBackend defaultStateBackend; /** The time characteristic used by the data streams. */ private TimeCharacteristic timeCharacteristic = DEFAULT_TIME_CHARACTERISTIC; protected final List> cacheFile = new ArrayList<>(); private final PipelineExecutorServiceLoader executorServiceLoader; // TODO: ------------ start:二次开发代码 --------------- // protected final Configuration configuration; // TODO: ------------ end:二次开发代码 --------------- // private final ClassLoader userClassloader; private final List jobListeners = new ArrayList<>(); // -------------------------------------------------------------------------------------------- // Constructor and Properties // -------------------------------------------------------------------------------------------- public StreamExecutionEnvironment() { this(new Configuration()); // unfortunately, StreamExecutionEnvironment always (implicitly) had a public constructor. // This constructor is not useful because the execution environment cannot be used for // execution. We're keeping this to appease the binary compatibiliy checks. } /** * Creates a new {@link StreamExecutionEnvironment} that will use the given {@link * Configuration} to configure the {@link PipelineExecutor}. */ @PublicEvolving public StreamExecutionEnvironment(final Configuration configuration) { this(configuration, null); } /** * Creates a new {@link StreamExecutionEnvironment} that will use the given {@link * Configuration} to configure the {@link PipelineExecutor}. * *

In addition, this constructor allows specifying the user code {@link ClassLoader}. */ @PublicEvolving public StreamExecutionEnvironment( final Configuration configuration, final ClassLoader userClassloader) { this(new DefaultExecutorServiceLoader(), configuration, userClassloader); } /** * Creates a new {@link StreamExecutionEnvironment} that will use the given {@link * Configuration} to configure the {@link PipelineExecutor}. * *

In addition, this constructor allows specifying the {@link PipelineExecutorServiceLoader} * and user code {@link ClassLoader}. */ @PublicEvolving public StreamExecutionEnvironment( final PipelineExecutorServiceLoader executorServiceLoader, final Configuration configuration, final ClassLoader userClassloader) { this.executorServiceLoader = checkNotNull(executorServiceLoader); this.configuration = new Configuration(checkNotNull(configuration)); this.userClassloader = userClassloader == null ? getClass().getClassLoader() : userClassloader; // the configuration of a job or an operator can be specified at the following places: // i) at the operator level using e.g. parallelism using the // SingleOutputStreamOperator.setParallelism(). // ii) programmatically by using e.g. the env.setRestartStrategy() method // iii) in the configuration passed here // // if specified in multiple places, the priority order is the above. // // Given this, it is safe to overwrite the execution config default values here because all // other ways assume // that the env is already instantiated so they will overwrite the value passed here. this.configure(this.configuration, this.userClassloader); } protected Configuration getConfiguration() { return this.configuration; } protected ClassLoader getUserClassloader() { return userClassloader; } /** Gets the config object. */ public ExecutionConfig getConfig() { return config; } /** * Get the list of cached files that were registered for distribution among the task managers. */ public List> getCachedFiles() { return cacheFile; } /** Gets the config JobListeners. */ @PublicEvolving public List getJobListeners() { return jobListeners; } /** * Sets the parallelism for operations executed through this environment. Setting a parallelism * of x here will cause all operators (such as map, batchReduce) to run with x parallel * instances. This method overrides the default parallelism for this environment. The {@link * LocalStreamEnvironment} uses by default a value equal to the number of hardware contexts (CPU * cores / threads). When executing the program via the command line client from a JAR file, the * default degree of parallelism is the one configured for that setup. * * @param parallelism The parallelism */ public StreamExecutionEnvironment setParallelism(int parallelism) { config.setParallelism(parallelism); return this; } /** * Sets the runtime execution mode for the application (see {@link RuntimeExecutionMode}). This * is equivalent to setting the {@code execution.runtime-mode} in your application's * configuration file. * *
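A short sketch of both ways of choosing the mode; the in-code call is the method documented here, the command-line flag is the configuration-based alternative it mentions (values are examples only):

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setRuntimeMode(RuntimeExecutionMode.BATCH);          // fixes the mode in code
    // or, keeping the application configuration-free, at submission time:
    //   flink run -Dexecution.runtime-mode=BATCH job.jar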

We recommend users to NOT use this method but set the {@code execution.runtime-mode} using * the command-line when submitting the application. Keeping the application code * configuration-free allows for more flexibility as the same application will be able to be * executed in any execution mode. * * @param executionMode the desired execution mode. * @return The execution environment of your application. */ @PublicEvolving public StreamExecutionEnvironment setRuntimeMode(final RuntimeExecutionMode executionMode) { checkNotNull(executionMode); configuration.set(ExecutionOptions.RUNTIME_MODE, executionMode); return this; } /** * Sets the maximum degree of parallelism defined for the program. The upper limit (inclusive) * is Short.MAX_VALUE. * *
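For example (values are illustrative only):

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);        // default parallelism for all operators
    env.setMaxParallelism(128);   // upper bound for rescaling, i.e. the number of key groups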

The maximum degree of parallelism specifies the upper limit for dynamic scaling. It also * defines the number of key groups used for partitioned state. * * @param maxParallelism Maximum degree of parallelism to be used for the program., with {@code * 0 < maxParallelism <= 2^15 - 1}. */ public StreamExecutionEnvironment setMaxParallelism(int maxParallelism) { Preconditions.checkArgument( maxParallelism > 0 && maxParallelism <= KeyGroupRangeAssignment.UPPER_BOUND_MAX_PARALLELISM, "maxParallelism is out of bounds 0 < maxParallelism <= " + KeyGroupRangeAssignment.UPPER_BOUND_MAX_PARALLELISM + ". Found: " + maxParallelism); config.setMaxParallelism(maxParallelism); return this; } /** * Gets the parallelism with which operation are executed by default. Operations can * individually override this value to use a specific parallelism. * * @return The parallelism used by operations, unless they override that value. */ public int getParallelism() { return config.getParallelism(); } /** * Gets the maximum degree of parallelism defined for the program. * *

The maximum degree of parallelism specifies the upper limit for dynamic scaling. It also * defines the number of key groups used for partitioned state. * * @return Maximum degree of parallelism */ public int getMaxParallelism() { return config.getMaxParallelism(); } /** * Sets the maximum time frequency (milliseconds) for the flushing of the output buffers. By * default the output buffers flush frequently to provide low latency and to aid smooth * developer experience. Setting the parameter can result in three logical modes: * *

  • A positive integer triggers flushing periodically by that integer *
  • 0 triggers flushing after every record thus minimizing latency *
  • -1 triggers flushing only when the output buffer is full thus maximizing throughput *
* * @param timeoutMillis The maximum time between two output flushes. */ public StreamExecutionEnvironment setBufferTimeout(long timeoutMillis) { if (timeoutMillis < -1) { throw new IllegalArgumentException("Timeout of buffer must be non-negative or -1"); } this.bufferTimeout = timeoutMillis; return this; } /** * Gets the maximum time frequency (milliseconds) for the flushing of the output buffers. For * clarification on the extremal values see {@link #setBufferTimeout(long)}. * * @return The timeout of the buffer. */ public long getBufferTimeout() { return this.bufferTimeout; } /** * Disables operator chaining for streaming operators. Operator chaining allows non-shuffle * operations to be co-located in the same thread fully avoiding serialization and * de-serialization. * * @return StreamExecutionEnvironment with chaining disabled. */ @PublicEvolving public StreamExecutionEnvironment disableOperatorChaining() { this.isChainingEnabled = false; return this; } /** * Returns whether operator chaining is enabled. * * @return {@code true} if chaining is enabled, false otherwise. */ @PublicEvolving public boolean isChainingEnabled() { return isChainingEnabled; } // ------------------------------------------------------------------------ // Checkpointing Settings // ------------------------------------------------------------------------ /** * Gets the checkpoint config, which defines values like checkpoint interval, delay between * checkpoints, etc. * * @return The checkpoint config. */ public CheckpointConfig getCheckpointConfig() { return checkpointCfg; } /** * Enables checkpointing for the streaming job. The distributed state of the streaming dataflow * will be periodically snapshotted. In case of a failure, the streaming dataflow will be * restarted from the latest completed checkpoint. This method selects {@link * CheckpointingMode#EXACTLY_ONCE} guarantees. * *
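A minimal checkpointing sketch consistent with this description (the interval is an arbitrary example):

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.enableCheckpointing(60_000);                                     // every 60s, exactly-once by default
    env.enableCheckpointing(60_000, CheckpointingMode.AT_LEAST_ONCE);    // or select the mode explicitly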

The job draws checkpoints periodically, in the given interval. The state will be stored in * the configured state backend. * *

NOTE: Checkpointing iterative streaming dataflows in not properly supported at the moment. * For that reason, iterative jobs will not be started if used with enabled checkpointing. To * override this mechanism, use the {@link #enableCheckpointing(long, CheckpointingMode, * boolean)} method. * * @param interval Time interval between state checkpoints in milliseconds. */ public StreamExecutionEnvironment enableCheckpointing(long interval) { checkpointCfg.setCheckpointInterval(interval); return this; } /** * Enables checkpointing for the streaming job. The distributed state of the streaming dataflow * will be periodically snapshotted. In case of a failure, the streaming dataflow will be * restarted from the latest completed checkpoint. * *

The job draws checkpoints periodically, in the given interval. The system uses the given * {@link CheckpointingMode} for the checkpointing ("exactly once" vs "at least once"). The * state will be stored in the configured state backend. * *

NOTE: Checkpointing iterative streaming dataflows in not properly supported at the moment. * For that reason, iterative jobs will not be started if used with enabled checkpointing. To * override this mechanism, use the {@link #enableCheckpointing(long, CheckpointingMode, * boolean)} method. * * @param interval Time interval between state checkpoints in milliseconds. * @param mode The checkpointing mode, selecting between "exactly once" and "at least once" * guaranteed. */ public StreamExecutionEnvironment enableCheckpointing(long interval, CheckpointingMode mode) { checkpointCfg.setCheckpointingMode(mode); checkpointCfg.setCheckpointInterval(interval); return this; } /** * Enables checkpointing for the streaming job. The distributed state of the streaming dataflow * will be periodically snapshotted. In case of a failure, the streaming dataflow will be * restarted from the latest completed checkpoint. * *

The job draws checkpoints periodically, in the given interval. The state will be stored in * the configured state backend. * *

NOTE: Checkpointing iterative streaming dataflows in not properly supported at the moment. * If the "force" parameter is set to true, the system will execute the job nonetheless. * * @param interval Time interval between state checkpoints in millis. * @param mode The checkpointing mode, selecting between "exactly once" and "at least once" * guaranteed. * @param force If true checkpointing will be enabled for iterative jobs as well. * @deprecated Use {@link #enableCheckpointing(long, CheckpointingMode)} instead. Forcing * checkpoints will be removed in the future. */ @Deprecated @SuppressWarnings("deprecation") @PublicEvolving public StreamExecutionEnvironment enableCheckpointing( long interval, CheckpointingMode mode, boolean force) { checkpointCfg.setCheckpointingMode(mode); checkpointCfg.setCheckpointInterval(interval); checkpointCfg.setForceCheckpointing(force); return this; } /** * Enables checkpointing for the streaming job. The distributed state of the streaming dataflow * will be periodically snapshotted. In case of a failure, the streaming dataflow will be * restarted from the latest completed checkpoint. This method selects {@link * CheckpointingMode#EXACTLY_ONCE} guarantees. * *

The job draws checkpoints periodically, in the default interval. The state will be stored * in the configured state backend. * *

NOTE: Checkpointing iterative streaming dataflows in not properly supported at the moment. * For that reason, iterative jobs will not be started if used with enabled checkpointing. To * override this mechanism, use the {@link #enableCheckpointing(long, CheckpointingMode, * boolean)} method. * * @deprecated Use {@link #enableCheckpointing(long)} instead. */ @Deprecated @PublicEvolving public StreamExecutionEnvironment enableCheckpointing() { checkpointCfg.setCheckpointInterval(500); return this; } /** * Returns the checkpointing interval or -1 if checkpointing is disabled. * *

Shorthand for {@code getCheckpointConfig().getCheckpointInterval()}. * * @return The checkpointing interval or -1 */ public long getCheckpointInterval() { return checkpointCfg.getCheckpointInterval(); } /** * Returns whether checkpointing is force-enabled. * * @deprecated Forcing checkpoints will be removed in future version. */ @Deprecated @SuppressWarnings("deprecation") @PublicEvolving public boolean isForceCheckpointing() { return checkpointCfg.isForceCheckpointing(); } /** Returns whether Unaligned Checkpoints are enabled. */ @PublicEvolving public boolean isUnalignedCheckpointsEnabled() { return checkpointCfg.isUnalignedCheckpointsEnabled(); } /** Returns whether Unaligned Checkpoints are force-enabled. */ @PublicEvolving public boolean isForceUnalignedCheckpoints() { return checkpointCfg.isForceUnalignedCheckpoints(); } /** * Returns the checkpointing mode (exactly-once vs. at-least-once). * *

Shorthand for {@code getCheckpointConfig().getCheckpointingMode()}. * * @return The checkpoint mode */ public CheckpointingMode getCheckpointingMode() { return checkpointCfg.getCheckpointingMode(); } /** * Sets the state backend that describes how to store and checkpoint operator state. It defines * both which data structures hold state during execution (for example hash tables, RockDB, or * other data stores) as well as where checkpointed data will be persisted. * *
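A hedged sketch of wiring in the backends named below (the checkpoint URI is a placeholder, not a path used by this project):

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStateBackend(new MemoryStateBackend());                        // heap-based, suitable only for small state
    env.setStateBackend(new FsStateBackend("hdfs:///flink/checkpoints")); // checkpoints on a replicated file system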

State managed by the state backend includes both keyed state that is accessible on {@link * org.apache.flink.streaming.api.datastream.KeyedStream keyed streams}, as well as state * maintained directly by the user code that implements {@link * org.apache.flink.streaming.api.checkpoint.CheckpointedFunction CheckpointedFunction}. * *

The {@link org.apache.flink.runtime.state.memory.MemoryStateBackend} for example maintains * the state in heap memory, as objects. It is lightweight without extra dependencies, but can * checkpoint only small states (some counters). * *

In contrast, the {@link org.apache.flink.runtime.state.filesystem.FsStateBackend} stores * checkpoints of the state (also maintained as heap objects) in files. When using a replicated * file system (like HDFS, S3, MapR FS, Alluxio, etc) this will guarantee that state is not lost * upon failures of individual nodes and that streaming program can be executed highly available * and strongly consistent (assuming that Flink is run in high-availability mode). * * @return This StreamExecutionEnvironment itself, to allow chaining of function calls. * @see #getStateBackend() */ @PublicEvolving public StreamExecutionEnvironment setStateBackend(StateBackend backend) { this.defaultStateBackend = Preconditions.checkNotNull(backend); return this; } /** * Gets the state backend that defines how to store and checkpoint state. * * @see #setStateBackend(StateBackend) */ @PublicEvolving public StateBackend getStateBackend() { return defaultStateBackend; } /** * Sets the restart strategy configuration. The configuration specifies which restart strategy * will be used for the execution graph in case of a restart. * * @param restartStrategyConfiguration Restart strategy configuration to be set */ @PublicEvolving public void setRestartStrategy( RestartStrategies.RestartStrategyConfiguration restartStrategyConfiguration) { config.setRestartStrategy(restartStrategyConfiguration); } /** * Returns the specified restart strategy configuration. * * @return The restart strategy configuration to be used */ @PublicEvolving public RestartStrategies.RestartStrategyConfiguration getRestartStrategy() { return config.getRestartStrategy(); } /** * Sets the number of times that failed tasks are re-executed. A value of zero effectively * disables fault tolerance. A value of {@code -1} indicates that the system default value (as * defined in the configuration) should be used. * * @param numberOfExecutionRetries The number of times the system will try to re-execute failed * tasks. * @deprecated This method will be replaced by {@link #setRestartStrategy}. The {@link * RestartStrategies#fixedDelayRestart(int, Time)} contains the number of execution retries. */ @Deprecated @PublicEvolving public void setNumberOfExecutionRetries(int numberOfExecutionRetries) { config.setNumberOfExecutionRetries(numberOfExecutionRetries); } /** * Gets the number of times the system will try to re-execute failed tasks. A value of {@code * -1} indicates that the system default value (as defined in the configuration) should be used. * * @return The number of times the system will try to re-execute failed tasks. * @deprecated This method will be replaced by {@link #getRestartStrategy}. */ @Deprecated @PublicEvolving public int getNumberOfExecutionRetries() { return config.getNumberOfExecutionRetries(); } // -------------------------------------------------------------------------------------------- // Registry for types and serializers // -------------------------------------------------------------------------------------------- /** * Adds a new Kryo default serializer to the Runtime. * *
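A sketch of the registration calls described in this section; MyEvent and MyEventKryoSerializer are hypothetical user classes, not part of this repository:

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.addDefaultKryoSerializer(MyEvent.class, MyEventKryoSerializer.class);       // fallback serializer when MyEvent goes through Kryo
    env.registerTypeWithKryoSerializer(MyEvent.class, MyEventKryoSerializer.class); // serializer registered for exactly MyEvent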

Note that the serializer instance must be serializable (as defined by * java.io.Serializable), because it may be distributed to the worker nodes by java * serialization. * * @param type The class of the types serialized with the given serializer. * @param serializer The serializer to use. */ public & Serializable> void addDefaultKryoSerializer( Class type, T serializer) { config.addDefaultKryoSerializer(type, serializer); } /** * Adds a new Kryo default serializer to the Runtime. * * @param type The class of the types serialized with the given serializer. * @param serializerClass The class of the serializer to use. */ public void addDefaultKryoSerializer( Class type, Class> serializerClass) { config.addDefaultKryoSerializer(type, serializerClass); } /** * Registers the given type with a Kryo Serializer. * *

Note that the serializer instance must be serializable (as defined by * java.io.Serializable), because it may be distributed to the worker nodes by java * serialization. * * @param type The class of the types serialized with the given serializer. * @param serializer The serializer to use. */ public & Serializable> void registerTypeWithKryoSerializer( Class type, T serializer) { config.registerTypeWithKryoSerializer(type, serializer); } /** * Registers the given Serializer via its class as a serializer for the given type at the * KryoSerializer. * * @param type The class of the types serialized with the given serializer. * @param serializerClass The class of the serializer to use. */ @SuppressWarnings("rawtypes") public void registerTypeWithKryoSerializer( Class type, Class serializerClass) { config.registerTypeWithKryoSerializer(type, serializerClass); } /** * Registers the given type with the serialization stack. If the type is eventually serialized * as a POJO, then the type is registered with the POJO serializer. If the type ends up being * serialized with Kryo, then it will be registered at Kryo to make sure that only tags are * written. * * @param type The class of the type to register. */ public void registerType(Class type) { if (type == null) { throw new NullPointerException("Cannot register null type class."); } TypeInformation typeInfo = TypeExtractor.createTypeInfo(type); if (typeInfo instanceof PojoTypeInfo) { config.registerPojoType(type); } else { config.registerKryoType(type); } } // -------------------------------------------------------------------------------------------- // Time characteristic // -------------------------------------------------------------------------------------------- /** * Sets the time characteristic for all streams create from this environment, e.g., processing * time, event time, or ingestion time. * *

If you set the characteristic to IngestionTime of EventTime this will set a default * watermark update interval of 200 ms. If this is not applicable for your application you * should change it using {@link ExecutionConfig#setAutoWatermarkInterval(long)}. * * @param characteristic The time characteristic. * @deprecated In Flink 1.12 the default stream time characteristic has been changed to {@link * TimeCharacteristic#EventTime}, thus you don't need to call this method for enabling * event-time support anymore. Explicitly using processing-time windows and timers works in * event-time mode. If you need to disable watermarks, please use {@link * ExecutionConfig#setAutoWatermarkInterval(long)}. If you are using {@link * TimeCharacteristic#IngestionTime}, please manually set an appropriate {@link * WatermarkStrategy}. If you are using generic "time window" operations (for example {@link * org.apache.flink.streaming.api.datastream.KeyedStream#timeWindow(org.apache.flink.streaming.api.windowing.time.Time)} * that change behaviour based on the time characteristic, please use equivalent operations * that explicitly specify processing time or event time. */ @PublicEvolving @Deprecated public void setStreamTimeCharacteristic(TimeCharacteristic characteristic) { this.timeCharacteristic = Preconditions.checkNotNull(characteristic); if (characteristic == TimeCharacteristic.ProcessingTime) { getConfig().setAutoWatermarkInterval(0); } else { getConfig().setAutoWatermarkInterval(200); } } /** * Gets the time characteristic. * * @deprecated See {@link #setStreamTimeCharacteristic(TimeCharacteristic)} for deprecation * notice. */ @PublicEvolving @Deprecated public TimeCharacteristic getStreamTimeCharacteristic() { return timeCharacteristic; } /** * Sets all relevant options contained in the {@link ReadableConfig} such as e.g. {@link * StreamPipelineOptions#TIME_CHARACTERISTIC}. It will reconfigure {@link * StreamExecutionEnvironment}, {@link ExecutionConfig} and {@link CheckpointConfig}. * *
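An illustrative call of this configuration-driven setup (option values are arbitrary examples):

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    Configuration conf = new Configuration();
    conf.set(PipelineOptions.OPERATOR_CHAINING, false);    // same effect as disableOperatorChaining()
    conf.set(PipelineOptions.NAME, "my-job");               // pipeline name, example value
    env.configure(conf, Thread.currentThread().getContextClassLoader());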

It will change the value of a setting only if a corresponding option was set in the {@code * configuration}. If a key is not present, the current value of a field will remain untouched. * * @param configuration a configuration to read the values from * @param classLoader a class loader to use when loading classes */ @PublicEvolving public void configure(ReadableConfig configuration, ClassLoader classLoader) { configuration .getOptional(StreamPipelineOptions.TIME_CHARACTERISTIC) .ifPresent(this::setStreamTimeCharacteristic); Optional.ofNullable(loadStateBackend(configuration, classLoader)) .ifPresent(this::setStateBackend); configuration .getOptional(PipelineOptions.OPERATOR_CHAINING) .ifPresent(c -> this.isChainingEnabled = c); configuration .getOptional(ExecutionOptions.BUFFER_TIMEOUT) .ifPresent(t -> this.setBufferTimeout(t.toMillis())); configuration .getOptional(DeploymentOptions.JOB_LISTENERS) .ifPresent(listeners -> registerCustomListeners(classLoader, listeners)); configuration .getOptional(PipelineOptions.CACHED_FILES) .ifPresent( f -> { this.cacheFile.clear(); this.cacheFile.addAll(DistributedCache.parseCachedFilesFromString(f)); }); configuration .getOptional(ExecutionOptions.RUNTIME_MODE) .ifPresent( runtimeMode -> this.configuration.set(ExecutionOptions.RUNTIME_MODE, runtimeMode)); configuration .getOptional(ExecutionOptions.SORT_INPUTS) .ifPresent( sortInputs -> this.getConfiguration() .set(ExecutionOptions.SORT_INPUTS, sortInputs)); configuration .getOptional(ExecutionOptions.USE_BATCH_STATE_BACKEND) .ifPresent( sortInputs -> this.getConfiguration() .set(ExecutionOptions.USE_BATCH_STATE_BACKEND, sortInputs)); configuration .getOptional(PipelineOptions.NAME) .ifPresent(jobName -> this.getConfiguration().set(PipelineOptions.NAME, jobName)); config.configure(configuration, classLoader); checkpointCfg.configure(configuration); } private void registerCustomListeners( final ClassLoader classLoader, final List listeners) { for (String listener : listeners) { try { final JobListener jobListener = InstantiationUtil.instantiate(listener, JobListener.class, classLoader); jobListeners.add(jobListener); } catch (FlinkException e) { throw new WrappingRuntimeException("Could not load JobListener : " + listener, e); } } } private StateBackend loadStateBackend(ReadableConfig configuration, ClassLoader classLoader) { try { return StateBackendLoader.loadStateBackendFromConfig(configuration, classLoader, null); } catch (DynamicCodeLoadingException | IOException e) { throw new WrappingRuntimeException(e); } } // -------------------------------------------------------------------------------------------- // Data stream creations // -------------------------------------------------------------------------------------------- /** * Creates a new data stream that contains a sequence of numbers. This is a parallel source, if * you manually set the parallelism to {@code 1} (using {@link * org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator#setParallelism(int)}) * the generated sequence of elements is in order. * * @param from The number to start at (inclusive) * @param to The number to stop at (inclusive) * @return A data stream, containing all number in the [from, to] interval * @deprecated Use {@link #fromSequence(long, long)} instead to create a new data stream that * contains {@link org.apache.flink.api.connector.source.lib.NumberSequenceSource}. 
*/ @Deprecated public DataStreamSource generateSequence(long from, long to) { if (from > to) { throw new IllegalArgumentException( "Start of sequence must not be greater than the end"); } return addSource(new StatefulSequenceSource(from, to), "Sequence Source (Deprecated)"); } /** * Creates a new data stream that contains a sequence of numbers (longs) and is useful for * testing and for cases that just need a stream of N events of any kind. * *
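For instance, a bounded test stream of one thousand longs:

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    DataStreamSource<Long> numbers = env.fromSequence(1L, 1_000L);   // bounded source producing 1..1000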

The generated source splits the sequence into as many parallel sub-sequences as there are * parallel source readers. Each sub-sequence will be produced in order. If the parallelism is * limited to one, the source will produce one sequence in order. * *

This source is always bounded. For very long sequences (for example over the entire domain * of long integer values), you may consider executing the application in a streaming manner * because of the end bound that is pretty far away. * *

Use {@link #fromSource(Source, WatermarkStrategy, String)} together with {@link * NumberSequenceSource} if you required more control over the created sources. For example, if * you want to set a {@link WatermarkStrategy}. * * @param from The number to start at (inclusive) * @param to The number to stop at (inclusive) */ public DataStreamSource fromSequence(long from, long to) { if (from > to) { throw new IllegalArgumentException( "Start of sequence must not be greater than the end"); } return fromSource( new NumberSequenceSource(from, to), WatermarkStrategy.noWatermarks(), "Sequence Source"); } /** * Creates a new data stream that contains the given elements. The elements must all be of the * same type, for example, all of the {@link String} or {@link Integer}. * *
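For example:

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    DataStreamSource<String> words = env.fromElements("fire", "flink", "spark");   // non-parallel source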

The framework will try and determine the exact type from the elements. In case of generic * elements, it may be necessary to manually supply the type information via {@link * #fromCollection(java.util.Collection, org.apache.flink.api.common.typeinfo.TypeInformation)}. * *

Note that this operation will result in a non-parallel data stream source, i.e. a data * stream source with a degree of parallelism one. * * @param data The array of elements to create the data stream from. * @param The type of the returned data stream * @return The data stream representing the given array of elements */ @SafeVarargs public final DataStreamSource fromElements(OUT... data) { if (data.length == 0) { throw new IllegalArgumentException( "fromElements needs at least one element as argument"); } TypeInformation typeInfo; try { typeInfo = TypeExtractor.getForObject(data[0]); } catch (Exception e) { throw new RuntimeException( "Could not create TypeInformation for type " + data[0].getClass().getName() + "; please specify the TypeInformation manually via " + "StreamExecutionEnvironment#fromElements(Collection, TypeInformation)", e); } return fromCollection(Arrays.asList(data), typeInfo); } /** * Creates a new data set that contains the given elements. The framework will determine the * type according to the based type user supplied. The elements should be the same or be the * subclass to the based type. The sequence of elements must not be empty. Note that this * operation will result in a non-parallel data stream source, i.e. a data stream source with a * degree of parallelism one. * * @param type The based class type in the collection. * @param data The array of elements to create the data stream from. * @param The type of the returned data stream * @return The data stream representing the given array of elements */ @SafeVarargs public final DataStreamSource fromElements(Class type, OUT... data) { if (data.length == 0) { throw new IllegalArgumentException( "fromElements needs at least one element as argument"); } TypeInformation typeInfo; try { typeInfo = TypeExtractor.getForClass(type); } catch (Exception e) { throw new RuntimeException( "Could not create TypeInformation for type " + type.getName() + "; please specify the TypeInformation manually via " + "StreamExecutionEnvironment#fromElements(Collection, TypeInformation)", e); } return fromCollection(Arrays.asList(data), typeInfo); } /** * Creates a data stream from the given non-empty collection. The type of the data stream is * that of the elements in the collection. * *

The framework will try and determine the exact type from the collection elements. In case * of generic elements, it may be necessary to manually supply the type information via {@link * #fromCollection(java.util.Collection, org.apache.flink.api.common.typeinfo.TypeInformation)}. * *

Note that this operation will result in a non-parallel data stream source, i.e. a data * stream source with parallelism one. * * @param data The collection of elements to create the data stream from. * @param The generic type of the returned data stream. * @return The data stream representing the given collection */ public DataStreamSource fromCollection(Collection data) { Preconditions.checkNotNull(data, "Collection must not be null"); if (data.isEmpty()) { throw new IllegalArgumentException("Collection must not be empty"); } OUT first = data.iterator().next(); if (first == null) { throw new IllegalArgumentException("Collection must not contain null elements"); } TypeInformation typeInfo; try { typeInfo = TypeExtractor.getForObject(first); } catch (Exception e) { throw new RuntimeException( "Could not create TypeInformation for type " + first.getClass() + "; please specify the TypeInformation manually via " + "StreamExecutionEnvironment#fromElements(Collection, TypeInformation)", e); } return fromCollection(data, typeInfo); } /** * Creates a data stream from the given non-empty collection. * *

Note that this operation will result in a non-parallel data stream source, i.e., a data * stream source with parallelism one. * * @param data The collection of elements to create the data stream from * @param typeInfo The TypeInformation for the produced data stream * @param The type of the returned data stream * @return The data stream representing the given collection */ public DataStreamSource fromCollection( Collection data, TypeInformation typeInfo) { Preconditions.checkNotNull(data, "Collection must not be null"); // must not have null elements and mixed elements FromElementsFunction.checkCollection(data, typeInfo.getTypeClass()); SourceFunction function; try { function = new FromElementsFunction<>(typeInfo.createSerializer(getConfig()), data); } catch (IOException e) { throw new RuntimeException(e.getMessage(), e); } return addSource(function, "Collection Source", typeInfo, Boundedness.BOUNDED) .setParallelism(1); } /** * Creates a data stream from the given iterator. * *
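/*
 * Editor's note: an illustrative sketch, not part of the original Flink source. Supplying the
 * TypeInformation explicitly is the escape hatch for generic element types that reflection
 * cannot resolve, e.g. Tuple2<String, Integer>. The variable "env" is assumed to be an
 * environment obtained via getExecutionEnvironment().
 *
 * <pre>{@code
 * List<Tuple2<String, Integer>> data =
 *         Arrays.asList(Tuple2.of("a", 1), Tuple2.of("b", 2));
 *
 * DataStream<Tuple2<String, Integer>> stream =
 *         env.fromCollection(data, Types.TUPLE(Types.STRING, Types.INT));
 * }</pre>
 */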

Because the iterator will remain unmodified until the actual execution happens, the type * of data returned by the iterator must be given explicitly in the form of the type class (this * is due to the fact that the Java compiler erases the generic type information). * *

Note that this operation will result in a non-parallel data stream source, i.e., a data * stream source with a parallelism of one. * * @param data The iterator of elements to create the data stream from * @param type The class of the data produced by the iterator. Must not be a generic class. * @param The type of the returned data stream * @return The data stream representing the elements in the iterator * @see #fromCollection(java.util.Iterator, * org.apache.flink.api.common.typeinfo.TypeInformation) */ public DataStreamSource fromCollection(Iterator data, Class type) { return fromCollection(data, TypeExtractor.getForClass(type)); } /** * Creates a data stream from the given iterator. * *

Because the iterator will remain unmodified until the actual execution happens, the type * of data returned by the iterator must be given explicitly in the form of the type * information. This method is useful for cases where the type is generic. In that case, the * type class (as given in {@link #fromCollection(java.util.Iterator, Class)} does not supply * all type information. * *

Note that this operation will result in a non-parallel data stream source, i.e., a data * stream source with parallelism one. * * @param data The iterator of elements to create the data stream from * @param typeInfo The TypeInformation for the produced data stream * @param The type of the returned data stream * @return The data stream representing the elements in the iterator */ public DataStreamSource fromCollection( Iterator data, TypeInformation typeInfo) { Preconditions.checkNotNull(data, "The iterator must not be null"); SourceFunction function = new FromIteratorFunction<>(data); return addSource(function, "Collection Source", typeInfo, Boundedness.BOUNDED); } /** * Creates a new data stream that contains elements in the iterator. The iterator is splittable, * allowing the framework to create a parallel data stream source that returns the elements in * the iterator. * *

Because the iterator will remain unmodified until the actual execution happens, the type * of data returned by the iterator must be given explicitly in the form of the type class (this * is due to the fact that the Java compiler erases the generic type information). * * @param iterator The iterator that produces the elements of the data stream * @param type The class of the data produced by the iterator. Must not be a generic class. * @param The type of the returned data stream * @return A data stream representing the elements in the iterator */ public DataStreamSource fromParallelCollection( SplittableIterator iterator, Class type) { return fromParallelCollection(iterator, TypeExtractor.getForClass(type)); } /** * Creates a new data stream that contains elements in the iterator. The iterator is splittable, * allowing the framework to create a parallel data stream source that returns the elements in * the iterator. * *

Because the iterator will remain unmodified until the actual execution happens, the type * of data returned by the iterator must be given explicitly in the form of the type * information. This method is useful for cases where the type is generic. In that case, the * type class (as given in {@link * #fromParallelCollection(org.apache.flink.util.SplittableIterator, Class)} does not supply all * type information. * * @param iterator The iterator that produces the elements of the data stream * @param typeInfo The TypeInformation for the produced data stream. * @param The type of the returned data stream * @return A data stream representing the elements in the iterator */ public DataStreamSource fromParallelCollection( SplittableIterator iterator, TypeInformation typeInfo) { return fromParallelCollection(iterator, typeInfo, "Parallel Collection Source"); } // private helper for passing different names private DataStreamSource fromParallelCollection( SplittableIterator iterator, TypeInformation typeInfo, String operatorName) { return addSource( new FromSplittableIteratorFunction<>(iterator), operatorName, typeInfo, Boundedness.BOUNDED); } /** * Reads the given file line-by-line and creates a data stream that contains a string with the * contents of each such line. The file will be read with the UTF-8 character set. * *
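/*
 * Editor's note: an illustrative sketch, not part of the original Flink source. A
 * NumberSequenceIterator is a SplittableIterator shipped with Flink, so the resulting source can
 * run with the parallelism of the environment.
 *
 * <pre>{@code
 * DataStream<Long> numbers =
 *         env.fromParallelCollection(new NumberSequenceIterator(1L, 1_000_000L), Long.class);
 * }</pre>
 */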

NOTES ON CHECKPOINTING: The source monitors the path, creates the {@link * org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards them to * the downstream readers to read the actual data, and exits, without waiting for the readers to * finish reading. This implies that no more checkpoint barriers are going to be forwarded after * the source exits, thus having no checkpoints after that point. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or * "hdfs://host:port/file/path"). * @return The data stream that represents the data read from the given file as text lines */ public DataStreamSource readTextFile(String filePath) { return readTextFile(filePath, "UTF-8"); } /** * Reads the given file line-by-line and creates a data stream that contains a string with the * contents of each such line. The {@link java.nio.charset.Charset} with the given name will be * used to read the files. * *

NOTES ON CHECKPOINTING: The source monitors the path, creates the {@link * org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards them to * the downstream readers to read the actual data, and exits, without waiting for the readers to * finish reading. This implies that no more checkpoint barriers are going to be forwarded after * the source exits, thus having no checkpoints after that point. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or * "hdfs://host:port/file/path") * @param charsetName The name of the character set used to read the file * @return The data stream that represents the data read from the given file as text lines */ public DataStreamSource readTextFile(String filePath, String charsetName) { Preconditions.checkArgument( !StringUtils.isNullOrWhitespaceOnly(filePath), "The file path must not be null or blank."); TextInputFormat format = new TextInputFormat(new Path(filePath)); format.setFilesFilter(FilePathFilter.createDefaultFilter()); TypeInformation typeInfo = BasicTypeInfo.STRING_TYPE_INFO; format.setCharsetName(charsetName); return readFile(format, filePath, FileProcessingMode.PROCESS_ONCE, -1, typeInfo); } /** * Reads the contents of the user-specified {@code filePath} based on the given {@link * FileInputFormat}. * *
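/*
 * Editor's note: an illustrative sketch, not part of the original Flink source. Reading a text
 * file once with an explicit charset; the HDFS path is a placeholder.
 *
 * <pre>{@code
 * DataStream<String> lines = env.readTextFile("hdfs://namenode:8020/logs/app.log", "UTF-8");
 * lines.filter(line -> line.contains("ERROR")).print();
 * }</pre>
 */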

Since all data streams need specific information about their types, this method needs to * determine the type of the data produced by the input format. It will attempt to determine the * data type by reflection, unless the input format implements the {@link * org.apache.flink.api.java.typeutils.ResultTypeQueryable} interface. In the latter case, this * method will invoke the {@link * org.apache.flink.api.java.typeutils.ResultTypeQueryable#getProducedType()} method to * determine data type produced by the input format. * *

NOTES ON CHECKPOINTING: The source monitors the path, creates the {@link * org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards them to * the downstream readers to read the actual data, and exits, without waiting for the readers to * finish reading. This implies that no more checkpoint barriers are going to be forwarded after * the source exits, thus having no checkpoints after that point. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or * "hdfs://host:port/file/path") * @param inputFormat The input format used to create the data stream * @param The type of the returned data stream * @return The data stream that represents the data read from the given file */ public DataStreamSource readFile(FileInputFormat inputFormat, String filePath) { return readFile(inputFormat, filePath, FileProcessingMode.PROCESS_ONCE, -1); } /** * Reads the contents of the user-specified {@code filePath} based on the given {@link * FileInputFormat}. Depending on the provided {@link FileProcessingMode}. * *

See {@link #readFile(FileInputFormat, String, FileProcessingMode, long)} * * @param inputFormat The input format used to create the data stream * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or * "hdfs://host:port/file/path") * @param watchType The mode in which the source should operate, i.e. monitor path and react to * new data, or process once and exit * @param interval In the case of periodic path monitoring, this specifies the interval (in * millis) between consecutive path scans * @param filter The files to be excluded from the processing * @param The type of the returned data stream * @return The data stream that represents the data read from the given file * @deprecated Use {@link FileInputFormat#setFilesFilter(FilePathFilter)} to set a filter and * {@link StreamExecutionEnvironment#readFile(FileInputFormat, String, FileProcessingMode, * long)} */ @PublicEvolving @Deprecated public DataStreamSource readFile( FileInputFormat inputFormat, String filePath, FileProcessingMode watchType, long interval, FilePathFilter filter) { inputFormat.setFilesFilter(filter); TypeInformation typeInformation; try { typeInformation = TypeExtractor.getInputFormatTypes(inputFormat); } catch (Exception e) { throw new InvalidProgramException( "The type returned by the input format could not be " + "automatically determined. Please specify the TypeInformation of the produced type " + "explicitly by using the 'createInput(InputFormat, TypeInformation)' method instead."); } return readFile(inputFormat, filePath, watchType, interval, typeInformation); } /** * Reads the contents of the user-specified {@code filePath} based on the given {@link * FileInputFormat}. Depending on the provided {@link FileProcessingMode}, the source may * periodically monitor (every {@code interval} ms) the path for new data ({@link * FileProcessingMode#PROCESS_CONTINUOUSLY}), or process once the data currently in the path and * exit ({@link FileProcessingMode#PROCESS_ONCE}). In addition, if the path contains files not * to be processed, the user can specify a custom {@link FilePathFilter}. As a default * implementation you can use {@link FilePathFilter#createDefaultFilter()}. * *

Since all data streams need specific information about their types, this method needs to * determine the type of the data produced by the input format. It will attempt to determine the * data type by reflection, unless the input format implements the {@link * org.apache.flink.api.java.typeutils.ResultTypeQueryable} interface. In the latter case, this * method will invoke the {@link * org.apache.flink.api.java.typeutils.ResultTypeQueryable#getProducedType()} method to * determine data type produced by the input format. * *

NOTES ON CHECKPOINTING: If the {@code watchType} is set to {@link * FileProcessingMode#PROCESS_ONCE}, the source monitors the path once, creates the * {@link org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards * them to the downstream readers to read the actual data, and exits, without waiting for the * readers to finish reading. This implies that no more checkpoint barriers are going to be * forwarded after the source exits, thus having no checkpoints after that point. * * @param inputFormat The input format used to create the data stream * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or * "hdfs://host:port/file/path") * @param watchType The mode in which the source should operate, i.e. monitor path and react to * new data, or process once and exit * @param interval In the case of periodic path monitoring, this specifies the interval (in * millis) between consecutive path scans * @param The type of the returned data stream * @return The data stream that represents the data read from the given file */ @PublicEvolving public DataStreamSource readFile( FileInputFormat inputFormat, String filePath, FileProcessingMode watchType, long interval) { TypeInformation typeInformation; try { typeInformation = TypeExtractor.getInputFormatTypes(inputFormat); } catch (Exception e) { throw new InvalidProgramException( "The type returned by the input format could not be " + "automatically determined. Please specify the TypeInformation of the produced type " + "explicitly by using the 'createInput(InputFormat, TypeInformation)' method instead."); } return readFile(inputFormat, filePath, watchType, interval, typeInformation); } /** * Creates a data stream that contains the contents of file created while system watches the * given path. The file will be read with the system's default character set. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or * "hdfs://host:port/file/path/") * @param intervalMillis The interval of file watching in milliseconds * @param watchType The watch type of file stream. When watchType is {@link * org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#ONLY_NEW_FILES}, * the system processes only new files. {@link * org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#REPROCESS_WITH_APPENDED} * means that the system re-processes all contents of appended file. {@link * org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#PROCESS_ONLY_APPENDED} * means that the system processes only appended contents of files. * @return The DataStream containing the given directory. * @deprecated Use {@link #readFile(FileInputFormat, String, FileProcessingMode, long)} instead. */ @Deprecated @SuppressWarnings("deprecation") public DataStream readFileStream( String filePath, long intervalMillis, FileMonitoringFunction.WatchType watchType) { DataStream> source = addSource( new FileMonitoringFunction(filePath, intervalMillis, watchType), "Read File Stream source"); return source.flatMap(new FileReadFunction()); } /** * Reads the contents of the user-specified {@code filePath} based on the given {@link * FileInputFormat}. Depending on the provided {@link FileProcessingMode}, the source may * periodically monitor (every {@code interval} ms) the path for new data ({@link * FileProcessingMode#PROCESS_CONTINUOUSLY}), or process once the data currently in the path and * exit ({@link FileProcessingMode#PROCESS_ONCE}). 
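/*
 * Editor's note: an illustrative sketch, not part of the original Flink source. Continuously
 * monitoring a directory every 10 seconds with a TextInputFormat; the path is a placeholder.
 *
 * <pre>{@code
 * TextInputFormat format = new TextInputFormat(new Path("file:///data/incoming"));
 * format.setFilesFilter(FilePathFilter.createDefaultFilter());
 *
 * DataStream<String> lines =
 *         env.readFile(format, "file:///data/incoming",
 *                 FileProcessingMode.PROCESS_CONTINUOUSLY, 10_000L);
 * }</pre>
 */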
In addition, if the path contains files not * to be processed, the user can specify a custom {@link FilePathFilter}. As a default * implementation you can use {@link FilePathFilter#createDefaultFilter()}. * *

NOTES ON CHECKPOINTING: If the {@code watchType} is set to {@link * FileProcessingMode#PROCESS_ONCE}, the source monitors the path once, creates the * {@link org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards * them to the downstream readers to read the actual data, and exits, without waiting for the * readers to finish reading. This implies that no more checkpoint barriers are going to be * forwarded after the source exits, thus having no checkpoints after that point. * * @param inputFormat The input format used to create the data stream * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or * "hdfs://host:port/file/path") * @param watchType The mode in which the source should operate, i.e. monitor path and react to * new data, or process once and exit * @param typeInformation Information on the type of the elements in the output stream * @param interval In the case of periodic path monitoring, this specifies the interval (in * millis) between consecutive path scans * @param The type of the returned data stream * @return The data stream that represents the data read from the given file */ @PublicEvolving public DataStreamSource readFile( FileInputFormat inputFormat, String filePath, FileProcessingMode watchType, long interval, TypeInformation typeInformation) { Preconditions.checkNotNull(inputFormat, "InputFormat must not be null."); Preconditions.checkArgument( !StringUtils.isNullOrWhitespaceOnly(filePath), "The file path must not be null or blank."); inputFormat.setFilePath(filePath); return createFileInput( inputFormat, typeInformation, "Custom File Source", watchType, interval); } /** * Creates a new data stream that contains the strings received infinitely from a socket. * Received strings are decoded by the system's default character set. On the termination of the * socket server connection retries can be initiated. * *

Let us note that the socket itself does not report on abort and as a consequence retries * are only initiated when the socket was gracefully terminated. * * @param hostname The host name which a server socket binds * @param port The port number which a server socket binds. A port number of 0 means that the * port number is automatically allocated. * @param delimiter A character which splits received strings into records * @param maxRetry The maximal retry interval in seconds while the program waits for a socket * that is temporarily down. Reconnection is initiated every second. A number of 0 means * that the reader is immediately terminated, while a negative value ensures retrying * forever. * @return A data stream containing the strings received from the socket * @deprecated Use {@link #socketTextStream(String, int, String, long)} instead. */ @Deprecated public DataStreamSource socketTextStream( String hostname, int port, char delimiter, long maxRetry) { return socketTextStream(hostname, port, String.valueOf(delimiter), maxRetry); } /** * Creates a new data stream that contains the strings received infinitely from a socket. * Received strings are decoded by the system's default character set. On the termination of the * socket server connection retries can be initiated. * *

Let us note that the socket itself does not report on abort and as a consequence retries * are only initiated when the socket was gracefully terminated. * * @param hostname The host name which a server socket binds * @param port The port number which a server socket binds. A port number of 0 means that the * port number is automatically allocated. * @param delimiter A string which splits received strings into records * @param maxRetry The maximal retry interval in seconds while the program waits for a socket * that is temporarily down. Reconnection is initiated every second. A number of 0 means * that the reader is immediately terminated, while a negative value ensures retrying * forever. * @return A data stream containing the strings received from the socket */ @PublicEvolving public DataStreamSource socketTextStream( String hostname, int port, String delimiter, long maxRetry) { return addSource( new SocketTextStreamFunction(hostname, port, delimiter, maxRetry), "Socket Stream"); } /** * Creates a new data stream that contains the strings received infinitely from a socket. * Received strings are decoded by the system's default character set. The reader is terminated * immediately when the socket is down. * * @param hostname The host name which a server socket binds * @param port The port number which a server socket binds. A port number of 0 means that the * port number is automatically allocated. * @param delimiter A character which splits received strings into records * @return A data stream containing the strings received from the socket * @deprecated Use {@link #socketTextStream(String, int, String)} instead. */ @Deprecated @SuppressWarnings("deprecation") public DataStreamSource socketTextStream(String hostname, int port, char delimiter) { return socketTextStream(hostname, port, delimiter, 0); } /** * Creates a new data stream that contains the strings received infinitely from a socket. * Received strings are decoded by the system's default character set. The reader is terminated * immediately when the socket is down. * * @param hostname The host name which a server socket binds * @param port The port number which a server socket binds. A port number of 0 means that the * port number is automatically allocated. * @param delimiter A string which splits received strings into records * @return A data stream containing the strings received from the socket */ @PublicEvolving public DataStreamSource socketTextStream(String hostname, int port, String delimiter) { return socketTextStream(hostname, port, delimiter, 0); } /** * Creates a new data stream that contains the strings received infinitely from a socket. * Received strings are decoded by the system's default character set, using"\n" as delimiter. * The reader is terminated immediately when the socket is down. * * @param hostname The host name which a server socket binds * @param port The port number which a server socket binds. A port number of 0 means that the * port number is automatically allocated. * @return A data stream containing the strings received from the socket */ @PublicEvolving public DataStreamSource socketTextStream(String hostname, int port) { return socketTextStream(hostname, port, "\n"); } /** * Generic method to create an input data stream with {@link * org.apache.flink.api.common.io.InputFormat}. * *
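/*
 * Editor's note: an illustrative sketch, not part of the original Flink source. Host and port are
 * placeholders; a negative maxRetry keeps reconnecting forever, as described above.
 *
 * <pre>{@code
 * DataStream<String> text = env.socketTextStream("localhost", 9999, "\n", -1L);
 * text.map(String::toUpperCase).print();
 * }</pre>
 */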

Since all data streams need specific information about their types, this method needs to * determine the type of the data produced by the input format. It will attempt to determine the * data type by reflection, unless the input format implements the {@link * org.apache.flink.api.java.typeutils.ResultTypeQueryable} interface. In the latter case, this * method will invoke the {@link * org.apache.flink.api.java.typeutils.ResultTypeQueryable#getProducedType()} method to * determine data type produced by the input format. * *

NOTES ON CHECKPOINTING: In the case of a {@link FileInputFormat}, the source * (which executes the {@link ContinuousFileMonitoringFunction}) monitors the path, creates the * {@link org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards * them to the downstream readers to read the actual data, and exits, without waiting for the * readers to finish reading. This implies that no more checkpoint barriers are going to be * forwarded after the source exits, thus having no checkpoints. * * @param inputFormat The input format used to create the data stream * @param The type of the returned data stream * @return The data stream that represents the data created by the input format */ @PublicEvolving public DataStreamSource createInput(InputFormat inputFormat) { return createInput(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat)); } /** * Generic method to create an input data stream with {@link * org.apache.flink.api.common.io.InputFormat}. * *

The data stream is typed to the given TypeInformation. This method is intended for input * formats where the return type cannot be determined by reflection analysis, and that do not * implement the {@link org.apache.flink.api.java.typeutils.ResultTypeQueryable} interface. * *

NOTES ON CHECKPOINTING: In the case of a {@link FileInputFormat}, the source * (which executes the {@link ContinuousFileMonitoringFunction}) monitors the path, creates the * {@link org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards * them to the downstream readers to read the actual data, and exits, without waiting for the * readers to finish reading. This implies that no more checkpoint barriers are going to be * forwarded after the source exits, thus having no checkpoints. * * @param inputFormat The input format used to create the data stream * @param typeInfo The information about the type of the output type * @param The type of the returned data stream * @return The data stream that represents the data created by the input format */ @PublicEvolving public DataStreamSource createInput( InputFormat inputFormat, TypeInformation typeInfo) { DataStreamSource source; if (inputFormat instanceof FileInputFormat) { @SuppressWarnings("unchecked") FileInputFormat format = (FileInputFormat) inputFormat; source = createFileInput( format, typeInfo, "Custom File source", FileProcessingMode.PROCESS_ONCE, -1); } else { source = createInput(inputFormat, typeInfo, "Custom Source"); } return source; } private DataStreamSource createInput( InputFormat inputFormat, TypeInformation typeInfo, String sourceName) { InputFormatSourceFunction function = new InputFormatSourceFunction<>(inputFormat, typeInfo); return addSource(function, sourceName, typeInfo); } private DataStreamSource createFileInput( FileInputFormat inputFormat, TypeInformation typeInfo, String sourceName, FileProcessingMode monitoringMode, long interval) { Preconditions.checkNotNull(inputFormat, "Unspecified file input format."); Preconditions.checkNotNull(typeInfo, "Unspecified output type information."); Preconditions.checkNotNull(sourceName, "Unspecified name for the source."); Preconditions.checkNotNull(monitoringMode, "Unspecified monitoring mode."); Preconditions.checkArgument( monitoringMode.equals(FileProcessingMode.PROCESS_ONCE) || interval >= ContinuousFileMonitoringFunction.MIN_MONITORING_INTERVAL, "The path monitoring interval cannot be less than " + ContinuousFileMonitoringFunction.MIN_MONITORING_INTERVAL + " ms."); ContinuousFileMonitoringFunction monitoringFunction = new ContinuousFileMonitoringFunction<>( inputFormat, monitoringMode, getParallelism(), interval); ContinuousFileReaderOperatorFactory factory = new ContinuousFileReaderOperatorFactory<>(inputFormat); final Boundedness boundedness = monitoringMode == FileProcessingMode.PROCESS_ONCE ? Boundedness.BOUNDED : Boundedness.CONTINUOUS_UNBOUNDED; SingleOutputStreamOperator source = addSource(monitoringFunction, sourceName, null, boundedness) .transform("Split Reader: " + sourceName, typeInfo, factory); return new DataStreamSource<>(source); } /** * Adds a Data Source to the streaming topology. * *
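/*
 * Editor's note: an illustrative sketch, not part of the original Flink source. Here a
 * TextInputFormat stands in for any custom InputFormat, and the explicit TypeInformation variant
 * is used because the produced type (String) is known up front.
 *
 * <pre>{@code
 * TextInputFormat format = new TextInputFormat(new Path("file:///data/input.txt"));
 * DataStream<String> input = env.createInput(format, BasicTypeInfo.STRING_TYPE_INFO);
 * }</pre>
 */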

By default sources have a parallelism of 1. To enable parallel execution, the user defined * source should implement {@link * org.apache.flink.streaming.api.functions.source.ParallelSourceFunction} or extend {@link * org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction}. In these cases * the resulting source will have the parallelism of the environment. To change this afterwards * call {@link org.apache.flink.streaming.api.datastream.DataStreamSource#setParallelism(int)} * * @param function the user defined function * @param type of the returned stream * @return the data stream constructed */ public DataStreamSource addSource(SourceFunction function) { return addSource(function, "Custom Source"); } /** * Adds a data source with a custom type information thus opening a {@link DataStream}. Only in * very special cases does the user need to support type information. Otherwise use {@link * #addSource(org.apache.flink.streaming.api.functions.source.SourceFunction)} * * @param function the user defined function * @param sourceName Name of the data source * @param type of the returned stream * @return the data stream constructed */ public DataStreamSource addSource(SourceFunction function, String sourceName) { return addSource(function, sourceName, null); } /** * Ads a data source with a custom type information thus opening a {@link DataStream}. Only in * very special cases does the user need to support type information. Otherwise use {@link * #addSource(org.apache.flink.streaming.api.functions.source.SourceFunction)} * * @param function the user defined function * @param type of the returned stream * @param typeInfo the user defined type information for the stream * @return the data stream constructed */ public DataStreamSource addSource( SourceFunction function, TypeInformation typeInfo) { return addSource(function, "Custom Source", typeInfo); } /** * Ads a data source with a custom type information thus opening a {@link DataStream}. Only in * very special cases does the user need to support type information. Otherwise use {@link * #addSource(org.apache.flink.streaming.api.functions.source.SourceFunction)} * * @param function the user defined function * @param sourceName Name of the data source * @param type of the returned stream * @param typeInfo the user defined type information for the stream * @return the data stream constructed */ public DataStreamSource addSource( SourceFunction function, String sourceName, TypeInformation typeInfo) { return addSource(function, sourceName, typeInfo, Boundedness.CONTINUOUS_UNBOUNDED); } private DataStreamSource addSource( final SourceFunction function, final String sourceName, @Nullable final TypeInformation typeInfo, final Boundedness boundedness) { checkNotNull(function); checkNotNull(sourceName); checkNotNull(boundedness); TypeInformation resolvedTypeInfo = getTypeInfo(function, sourceName, SourceFunction.class, typeInfo); boolean isParallel = function instanceof ParallelSourceFunction; clean(function); final StreamSource sourceOperator = new StreamSource<>(function); return new DataStreamSource<>( this, resolvedTypeInfo, sourceOperator, isParallel, sourceName, boundedness); } /** * Adds a data {@link Source} to the environment to get a {@link DataStream}. * *
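/*
 * Editor's note: an illustrative sketch, not part of the original Flink source. A minimal custom
 * SourceFunction registered via addSource; the class name "HeartbeatSource" is made up.
 *
 * <pre>{@code
 * public class HeartbeatSource implements SourceFunction<Long> {
 *     private volatile boolean running = true;
 *
 *     @Override
 *     public void run(SourceContext<Long> ctx) throws Exception {
 *         while (running) {
 *             ctx.collect(System.currentTimeMillis()); // emit one timestamp per second
 *             Thread.sleep(1000);
 *         }
 *     }
 *
 *     @Override
 *     public void cancel() {
 *         running = false;
 *     }
 * }
 *
 * DataStream<Long> heartbeats = env.addSource(new HeartbeatSource(), "Heartbeat Source");
 * }</pre>
 */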

The result will be either a bounded data stream (that can be processed in a batch way) or * an unbounded data stream (that must be processed in a streaming way), based on the * boundedness property of the source, as defined by {@link Source#getBoundedness()}. * *

The result type (that is used to create serializers for the produced data events) will be * automatically extracted. This is useful for sources that describe the produced types already * in their configuration, to avoid having to declare the type multiple times. For example, the * file sources and Kafka sources already define the produced type via their * parsers/serializers/formats, and can forward that information. * * @param source the user defined source * @param sourceName Name of the data source * @param type of the returned stream * @return the data stream constructed */ @Experimental public DataStreamSource fromSource( Source source, WatermarkStrategy timestampsAndWatermarks, String sourceName) { return fromSource(source, timestampsAndWatermarks, sourceName, null); } /** * Adds a data {@link Source} to the environment to get a {@link DataStream}. * *

The result will be either a bounded data stream (that can be processed in a batch way) or * an unbounded data stream (that must be processed in a streaming way), based on the * boundedness property of the source, as defined by {@link Source#getBoundedness()}. * *

This method takes an explicit type information for the produced data stream, so that * callers can define directly what type/serializer will be used for the produced stream. For * sources that describe their produced type, the method {@link #fromSource(Source, * WatermarkStrategy, String)} can be used to avoid specifying the produced type redundantly. * * @param source the user defined source * @param sourceName Name of the data source * @param type of the returned stream * @param typeInfo the user defined type information for the stream * @return the data stream constructed */ @Experimental public DataStreamSource fromSource( Source source, WatermarkStrategy timestampsAndWatermarks, String sourceName, TypeInformation typeInfo) { final TypeInformation resolvedTypeInfo = getTypeInfo(source, sourceName, Source.class, typeInfo); return new DataStreamSource<>( this, checkNotNull(source, "source"), checkNotNull(timestampsAndWatermarks, "timestampsAndWatermarks"), checkNotNull(resolvedTypeInfo), checkNotNull(sourceName)); } /** * Triggers the program execution. The environment will execute all parts of the program that * have resulted in a "sink" operation. Sink operations are for example printing results or * forwarding them to a message queue. * *
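/*
 * Editor's note: an illustrative sketch, not part of the original Flink source. Wiring a FLIP-27
 * style Source with an explicit WatermarkStrategy, matching the fromSource overloads documented
 * above.
 *
 * <pre>{@code
 * DataStream<Long> numbers =
 *         env.fromSource(
 *                 new NumberSequenceSource(1L, 1_000L),
 *                 WatermarkStrategy.noWatermarks(),
 *                 "Number Sequence Source");
 * }</pre>
 */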

The program execution will be logged and displayed with a generated default name. * * @return The result of the job execution, containing elapsed time and accumulators. * @throws Exception which occurs during job execution. */ public JobExecutionResult execute() throws Exception { return execute(getJobName()); } /** * Triggers the program execution. The environment will execute all parts of the program that * have resulted in a "sink" operation. Sink operations are for example printing results or * forwarding them to a message queue. * *

The program execution will be logged and displayed with the provided name * * @param jobName Desired name of the job * @return The result of the job execution, containing elapsed time and accumulators. * @throws Exception which occurs during job execution. */ public JobExecutionResult execute(String jobName) throws Exception { Preconditions.checkNotNull(jobName, "Streaming Job name should not be null."); return execute(getStreamGraph(jobName)); } /** * Triggers the program execution. The environment will execute all parts of the program that * have resulted in a "sink" operation. Sink operations are for example printing results or * forwarding them to a message queue. * * @param streamGraph the stream graph representing the transformations * @return The result of the job execution, containing elapsed time and accumulators. * @throws Exception which occurs during job execution. */ @Internal public JobExecutionResult execute(StreamGraph streamGraph) throws Exception { final JobClient jobClient = executeAsync(streamGraph); try { final JobExecutionResult jobExecutionResult; if (configuration.getBoolean(DeploymentOptions.ATTACHED)) { jobExecutionResult = jobClient.getJobExecutionResult().get(); } else { jobExecutionResult = new DetachedJobExecutionResult(jobClient.getJobID()); } jobListeners.forEach( jobListener -> jobListener.onJobExecuted(jobExecutionResult, null)); return jobExecutionResult; } catch (Throwable t) { // get() on the JobExecutionResult Future will throw an ExecutionException. This // behaviour was largely not there in Flink versions before the PipelineExecutor // refactoring so we should strip that exception. Throwable strippedException = ExceptionUtils.stripExecutionException(t); jobListeners.forEach( jobListener -> { jobListener.onJobExecuted(null, strippedException); }); ExceptionUtils.rethrowException(strippedException); // never reached, only make javac happy return null; } } /** * Register a {@link JobListener} in this environment. The {@link JobListener} will be notified * on specific job status changed. */ @PublicEvolving public void registerJobListener(JobListener jobListener) { checkNotNull(jobListener, "JobListener cannot be null"); jobListeners.add(jobListener); } /** Clear all registered {@link JobListener}s. */ @PublicEvolving public void clearJobListeners() { this.jobListeners.clear(); } /** * Triggers the program asynchronously. The environment will execute all parts of the program * that have resulted in a "sink" operation. Sink operations are for example printing results or * forwarding them to a message queue. * *

The program execution will be logged and displayed with a generated default name. * * @return A {@link JobClient} that can be used to communicate with the submitted job, completed * on submission succeeded. * @throws Exception which occurs during job execution. */ @PublicEvolving public final JobClient executeAsync() throws Exception { return executeAsync(getJobName()); } /** * Triggers the program execution asynchronously. The environment will execute all parts of the * program that have resulted in a "sink" operation. Sink operations are for example printing * results or forwarding them to a message queue. * *

The program execution will be logged and displayed with the provided name * * @param jobName desired name of the job * @return A {@link JobClient} that can be used to communicate with the submitted job, completed * on submission succeeded. * @throws Exception which occurs during job execution. */ @PublicEvolving public JobClient executeAsync(String jobName) throws Exception { return executeAsync(getStreamGraph(checkNotNull(jobName))); } /** * Triggers the program execution asynchronously. The environment will execute all parts of the * program that have resulted in a "sink" operation. Sink operations are for example printing * results or forwarding them to a message queue. * * @param streamGraph the stream graph representing the transformations * @return A {@link JobClient} that can be used to communicate with the submitted job, completed * on submission succeeded. * @throws Exception which occurs during job execution. */ @Internal public JobClient executeAsync(StreamGraph streamGraph) throws Exception { checkNotNull(streamGraph, "StreamGraph cannot be null."); checkNotNull( configuration.get(DeploymentOptions.TARGET), "No execution.target specified in your configuration file."); final PipelineExecutorFactory executorFactory = executorServiceLoader.getExecutorFactory(configuration); checkNotNull( executorFactory, "Cannot find compatible factory for specified execution.target (=%s)", configuration.get(DeploymentOptions.TARGET)); CompletableFuture jobClientFuture = executorFactory .getExecutor(configuration) .execute(streamGraph, configuration, userClassloader); try { JobClient jobClient = jobClientFuture.get(); jobListeners.forEach(jobListener -> jobListener.onJobSubmitted(jobClient, null)); return jobClient; } catch (ExecutionException executionException) { final Throwable strippedException = ExceptionUtils.stripExecutionException(executionException); jobListeners.forEach( jobListener -> jobListener.onJobSubmitted(null, strippedException)); throw new FlinkException( String.format("Failed to execute job '%s'.", streamGraph.getJobName()), strippedException); } } /** * Getter of the {@link org.apache.flink.streaming.api.graph.StreamGraph} of the streaming job. * This call clears previously registered {@link Transformation transformations}. * * @return The streamgraph representing the transformations */ @Internal public StreamGraph getStreamGraph() { return getStreamGraph(getJobName()); } /** * Getter of the {@link org.apache.flink.streaming.api.graph.StreamGraph} of the streaming job. * This call clears previously registered {@link Transformation transformations}. * * @param jobName Desired name of the job * @return The streamgraph representing the transformations */ @Internal public StreamGraph getStreamGraph(String jobName) { return getStreamGraph(jobName, true); } /** * Getter of the {@link org.apache.flink.streaming.api.graph.StreamGraph StreamGraph} of the * streaming job with the option to clear previously registered {@link Transformation * transformations}. Clearing the transformations allows, for example, to not re-execute the * same operations when calling {@link #execute()} multiple times. 
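/*
 * Editor's note: an illustrative sketch, not part of the original Flink source. Asynchronous
 * submission returns a JobClient that can be polled for status or used to cancel the job; the
 * job name "async-demo" is assumed.
 *
 * <pre>{@code
 * JobClient client = env.executeAsync("async-demo");
 * System.out.println("Submitted job " + client.getJobID());
 * JobStatus status = client.getJobStatus().get();   // CompletableFuture<JobStatus>
 * }</pre>
 */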
* * @param jobName Desired name of the job * @param clearTransformations Whether or not to clear previously registered transformations * @return The streamgraph representing the transformations */ @Internal public StreamGraph getStreamGraph(String jobName, boolean clearTransformations) { StreamGraph streamGraph = getStreamGraphGenerator().setJobName(jobName).generate(); if (clearTransformations) { this.transformations.clear(); } return streamGraph; } private StreamGraphGenerator getStreamGraphGenerator() { if (transformations.size() <= 0) { throw new IllegalStateException( "No operators defined in streaming topology. Cannot execute."); } final RuntimeExecutionMode executionMode = configuration.get(ExecutionOptions.RUNTIME_MODE); return new StreamGraphGenerator(transformations, config, checkpointCfg, getConfiguration()) .setRuntimeExecutionMode(executionMode) .setStateBackend(defaultStateBackend) .setChaining(isChainingEnabled) .setUserArtifacts(cacheFile) .setTimeCharacteristic(timeCharacteristic) .setDefaultBufferTimeout(bufferTimeout); } /** * Creates the plan with which the system will execute the program, and returns it as a String * using a JSON representation of the execution data flow graph. Note that this needs to be * called, before the plan is executed. * * @return The execution plan of the program, as a JSON String. */ public String getExecutionPlan() { return getStreamGraph(getJobName(), false).getStreamingPlanAsJSON(); } /** * Returns a "closure-cleaned" version of the given function. Cleans only if closure cleaning is * not disabled in the {@link org.apache.flink.api.common.ExecutionConfig} */ @Internal public F clean(F f) { if (getConfig().isClosureCleanerEnabled()) { ClosureCleaner.clean(f, getConfig().getClosureCleanerLevel(), true); } ClosureCleaner.ensureSerializable(f); return f; } /** * Adds an operator to the list of operators that should be executed when calling {@link * #execute}. * *
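/*
 * Editor's note: an illustrative sketch, not part of the original Flink source. Dumping the JSON
 * execution plan before submitting; as the Javadoc above notes, getExecutionPlan() has to be
 * called before the plan is executed.
 *
 * <pre>{@code
 * env.fromSequence(1L, 100L).map(i -> i + 1).print();
 * System.out.println(env.getExecutionPlan());   // JSON representation of the stream graph
 * env.execute("plan-demo");
 * }</pre>
 */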

When calling {@link #execute()} only the operators that where previously added to the list * are executed. * *

This is not meant to be used by users. The API methods that create operators must call * this method. */ @Internal public void addOperator(Transformation transformation) { Preconditions.checkNotNull(transformation, "transformation must not be null."); this.transformations.add(transformation); } // -------------------------------------------------------------------------------------------- // Factory methods for ExecutionEnvironments // -------------------------------------------------------------------------------------------- /** * Creates an execution environment that represents the context in which the program is * currently executed. If the program is invoked standalone, this method returns a local * execution environment, as returned by {@link #createLocalEnvironment()}. * * @return The execution environment of the context in which the program is executed. */ public static StreamExecutionEnvironment getExecutionEnvironment() { return getExecutionEnvironment(new Configuration()); } /** * Creates an execution environment that represents the context in which the program is * currently executed. If the program is invoked standalone, this method returns a local * execution environment, as returned by {@link #createLocalEnvironment(Configuration)}. * *

When executed from the command line the given configuration is stacked on top of the * global configuration which comes from the {@code flink-conf.yaml}, potentially overriding * duplicated options. * * @param configuration The configuration to instantiate the environment with. * @return The execution environment of the context in which the program is executed. */ public static StreamExecutionEnvironment getExecutionEnvironment(Configuration configuration) { return Utils.resolveFactory(threadLocalContextEnvironmentFactory, contextEnvironmentFactory) .map(factory -> factory.createExecutionEnvironment(configuration)) .orElseGet(() -> StreamExecutionEnvironment.createLocalEnvironment(configuration)); } /** * Creates a {@link LocalStreamEnvironment}. The local execution environment will run the * program in a multi-threaded fashion in the same JVM as the environment was created in. The * default parallelism of the local environment is the number of hardware contexts (CPU cores / * threads), unless it was specified differently by {@link #setParallelism(int)}. * * @return A local execution environment. */ public static LocalStreamEnvironment createLocalEnvironment() { return createLocalEnvironment(defaultLocalParallelism); } /** * Creates a {@link LocalStreamEnvironment}. The local execution environment will run the * program in a multi-threaded fashion in the same JVM as the environment was created in. It * will use the parallelism specified in the parameter. * * @param parallelism The parallelism for the local environment. * @return A local execution environment with the specified parallelism. */ public static LocalStreamEnvironment createLocalEnvironment(int parallelism) { return createLocalEnvironment(parallelism, new Configuration()); } /** * Creates a {@link LocalStreamEnvironment}. The local execution environment will run the * program in a multi-threaded fashion in the same JVM as the environment was created in. It * will use the parallelism specified in the parameter. * * @param parallelism The parallelism for the local environment. * @param configuration Pass a custom configuration into the cluster * @return A local execution environment with the specified parallelism. */ public static LocalStreamEnvironment createLocalEnvironment( int parallelism, Configuration configuration) { Configuration copyOfConfiguration = new Configuration(); copyOfConfiguration.addAll(configuration); copyOfConfiguration.set(CoreOptions.DEFAULT_PARALLELISM, parallelism); return createLocalEnvironment(copyOfConfiguration); } /** * Creates a {@link LocalStreamEnvironment}. The local execution environment will run the * program in a multi-threaded fashion in the same JVM as the environment was created in. * * @param configuration Pass a custom configuration into the cluster * @return A local execution environment with the specified parallelism. */ public static LocalStreamEnvironment createLocalEnvironment(Configuration configuration) { if (configuration.getOptional(CoreOptions.DEFAULT_PARALLELISM).isPresent()) { return new LocalStreamEnvironment(configuration); } else { Configuration copyOfConfiguration = new Configuration(); copyOfConfiguration.addAll(configuration); copyOfConfiguration.set(CoreOptions.DEFAULT_PARALLELISM, defaultLocalParallelism); return new LocalStreamEnvironment(copyOfConfiguration); } } /** * Creates a {@link LocalStreamEnvironment} for local program execution that also starts the web * monitoring UI. * *
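/*
 * Editor's note: an illustrative sketch, not part of the original Flink source. Passing a
 * Configuration when obtaining the environment; when the job is started from the command line,
 * these options are layered on top of flink-conf.yaml as described above.
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * conf.set(CoreOptions.DEFAULT_PARALLELISM, 4);
 * StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(conf);
 * }</pre>
 */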

The local execution environment will run the program in a multi-threaded fashion in the * same JVM as the environment was created in. It will use the parallelism specified in the * parameter. * *

If the configuration key 'rest.port' was set in the configuration, that particular port * will be used for the web UI. Otherwise, the default port (8081) will be used. */ @PublicEvolving public static StreamExecutionEnvironment createLocalEnvironmentWithWebUI(Configuration conf) { checkNotNull(conf, "conf"); if (!conf.contains(RestOptions.PORT)) { // explicitly set this option so that it's not set to 0 later conf.setInteger(RestOptions.PORT, RestOptions.PORT.defaultValue()); } return createLocalEnvironment(conf); } /** * Creates a {@link RemoteStreamEnvironment}. The remote environment sends (parts of) the * program to a cluster for execution. Note that all file paths used in the program must be * accessible from the cluster. The execution will use no parallelism, unless the parallelism is * set explicitly via {@link #setParallelism}. * * @param host The host name or address of the master (JobManager), where the program should be * executed. * @param port The port of the master (JobManager), where the program should be executed. * @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the * program uses user-defined functions, user-defined input formats, or any libraries, those * must be provided in the JAR files. * @return A remote environment that executes the program on a cluster. */ public static StreamExecutionEnvironment createRemoteEnvironment( String host, int port, String... jarFiles) { return new RemoteStreamEnvironment(host, port, jarFiles); } /** * Creates a {@link RemoteStreamEnvironment}. The remote environment sends (parts of) the * program to a cluster for execution. Note that all file paths used in the program must be * accessible from the cluster. The execution will use the specified parallelism. * * @param host The host name or address of the master (JobManager), where the program should be * executed. * @param port The port of the master (JobManager), where the program should be executed. * @param parallelism The parallelism to use during the execution. * @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the * program uses user-defined functions, user-defined input formats, or any libraries, those * must be provided in the JAR files. * @return A remote environment that executes the program on a cluster. */ public static StreamExecutionEnvironment createRemoteEnvironment( String host, int port, int parallelism, String... jarFiles) { RemoteStreamEnvironment env = new RemoteStreamEnvironment(host, port, jarFiles); env.setParallelism(parallelism); return env; } /** * Creates a {@link RemoteStreamEnvironment}. The remote environment sends (parts of) the * program to a cluster for execution. Note that all file paths used in the program must be * accessible from the cluster. The execution will use the specified parallelism. * * @param host The host name or address of the master (JobManager), where the program should be * executed. * @param port The port of the master (JobManager), where the program should be executed. * @param clientConfig The configuration used by the client that connects to the remote cluster. * @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the * program uses user-defined functions, user-defined input formats, or any libraries, those * must be provided in the JAR files. * @return A remote environment that executes the program on a cluster. 
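/*
 * Editor's note: an illustrative sketch, not part of the original Flink source. Starting a local
 * environment with the web UI on a non-default port for debugging.
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * conf.setInteger(RestOptions.PORT, 8082);   // default would be 8081
 * StreamExecutionEnvironment env =
 *         StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
 * }</pre>
 */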
*/ public static StreamExecutionEnvironment createRemoteEnvironment( String host, int port, Configuration clientConfig, String... jarFiles) { return new RemoteStreamEnvironment(host, port, clientConfig, jarFiles); } /** * Gets the default parallelism that will be used for the local execution environment created by * {@link #createLocalEnvironment()}. * * @return The default local parallelism */ @PublicEvolving public static int getDefaultLocalParallelism() { return defaultLocalParallelism; } /** * Sets the default parallelism that will be used for the local execution environment created by * {@link #createLocalEnvironment()}. * * @param parallelism The parallelism to use as the default local parallelism. */ @PublicEvolving public static void setDefaultLocalParallelism(int parallelism) { defaultLocalParallelism = parallelism; } // -------------------------------------------------------------------------------------------- // Methods to control the context and local environments for execution from packaged programs // -------------------------------------------------------------------------------------------- protected static void initializeContextEnvironment(StreamExecutionEnvironmentFactory ctx) { contextEnvironmentFactory = ctx; threadLocalContextEnvironmentFactory.set(contextEnvironmentFactory); } protected static void resetContextEnvironment() { contextEnvironmentFactory = null; threadLocalContextEnvironmentFactory.remove(); } /** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files may be * local files (which will be distributed via BlobServer), or files in a distributed file * system. The runtime will copy the files temporarily to a local cache, if needed. * *

The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside * UDFs via {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and * provides access {@link org.apache.flink.api.common.cache.DistributedCache} via {@link * org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or * "hdfs://host:port/and/path") * @param name The name under which the file is registered. */ public void registerCachedFile(String filePath, String name) { registerCachedFile(filePath, name, false); } /** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files may be * local files (which will be distributed via BlobServer), or files in a distributed file * system. The runtime will copy the files temporarily to a local cache, if needed. * *
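/*
 * Editor's note: an illustrative sketch, not part of the original Flink source. Registering a
 * file in the distributed cache and reading it back inside a RichFunction; the path and lookup
 * name are placeholders, and "stream" stands for any DataStream<String> defined earlier.
 *
 * <pre>{@code
 * env.registerCachedFile("hdfs://namenode:8020/config/rules.txt", "rules");
 *
 * stream.map(new RichMapFunction<String, String>() {
 *     @Override
 *     public void open(Configuration parameters) throws Exception {
 *         File rules = getRuntimeContext().getDistributedCache().getFile("rules");
 *         // ... load the rules file into memory ...
 *     }
 *
 *     @Override
 *     public String map(String value) {
 *         return value;
 *     }
 * });
 * }</pre>
 */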

The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside * UDFs via {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and * provides access {@link org.apache.flink.api.common.cache.DistributedCache} via {@link * org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or * "hdfs://host:port/and/path") * @param name The name under which the file is registered. * @param executable flag indicating whether the file should be executable */ public void registerCachedFile(String filePath, String name, boolean executable) { this.cacheFile.add( new Tuple2<>( name, new DistributedCache.DistributedCacheEntry(filePath, executable))); } // Private helpers. @SuppressWarnings("unchecked") private > T getTypeInfo( Object source, String sourceName, Class baseSourceClass, TypeInformation typeInfo) { TypeInformation resolvedTypeInfo = typeInfo; if (resolvedTypeInfo == null && source instanceof ResultTypeQueryable) { resolvedTypeInfo = ((ResultTypeQueryable) source).getProducedType(); } if (resolvedTypeInfo == null) { try { resolvedTypeInfo = TypeExtractor.createTypeInfo( baseSourceClass, source.getClass(), 0, null, null); } catch (final InvalidTypesException e) { resolvedTypeInfo = (TypeInformation) new MissingTypeInfo(sourceName, e); } } return (T) resolvedTypeInfo; } private String getJobName() { return configuration.getString(PipelineOptions.NAME, DEFAULT_JOB_NAME); } } ================================================ FILE: fire-shell/flink-shell/src/main/java-flink-1.13/org.apache.flink.streaming.api.environment/StreamExecutionEnvironment.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.flink.streaming.api.environment; import org.apache.flink.annotation.Experimental; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.Public; import org.apache.flink.annotation.PublicEvolving; import org.apache.flink.api.common.ExecutionConfig; import org.apache.flink.api.common.InvalidProgramException; import org.apache.flink.api.common.JobExecutionResult; import org.apache.flink.api.common.RuntimeExecutionMode; import org.apache.flink.api.common.cache.DistributedCache; import org.apache.flink.api.common.eventtime.WatermarkStrategy; import org.apache.flink.api.common.functions.InvalidTypesException; import org.apache.flink.api.common.io.FileInputFormat; import org.apache.flink.api.common.io.FilePathFilter; import org.apache.flink.api.common.io.InputFormat; import org.apache.flink.api.common.restartstrategy.RestartStrategies; import org.apache.flink.api.common.time.Time; import org.apache.flink.api.common.typeinfo.BasicTypeInfo; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.connector.source.Boundedness; import org.apache.flink.api.connector.source.Source; import org.apache.flink.api.connector.source.lib.NumberSequenceSource; import org.apache.flink.api.dag.Transformation; import org.apache.flink.api.java.ClosureCleaner; import org.apache.flink.api.java.Utils; import org.apache.flink.api.java.io.TextInputFormat; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.api.java.tuple.Tuple3; import org.apache.flink.api.java.typeutils.MissingTypeInfo; import org.apache.flink.api.java.typeutils.PojoTypeInfo; import org.apache.flink.api.java.typeutils.ResultTypeQueryable; import org.apache.flink.api.java.typeutils.TypeExtractor; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.CoreOptions; import org.apache.flink.configuration.DeploymentOptions; import org.apache.flink.configuration.ExecutionOptions; import org.apache.flink.configuration.PipelineOptions; import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.configuration.RestOptions; import org.apache.flink.core.execution.DefaultExecutorServiceLoader; import org.apache.flink.core.execution.DetachedJobExecutionResult; import org.apache.flink.core.execution.JobClient; import org.apache.flink.core.execution.JobListener; import org.apache.flink.core.execution.PipelineExecutor; import org.apache.flink.core.execution.PipelineExecutorFactory; import org.apache.flink.core.execution.PipelineExecutorServiceLoader; import org.apache.flink.core.fs.Path; import org.apache.flink.runtime.state.KeyGroupRangeAssignment; import org.apache.flink.runtime.state.StateBackend; import org.apache.flink.runtime.state.StateBackendLoader; import org.apache.flink.streaming.api.CheckpointingMode; import org.apache.flink.streaming.api.TimeCharacteristic; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSource; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; import org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction; import org.apache.flink.streaming.api.functions.source.ContinuousFileReaderOperatorFactory; import org.apache.flink.streaming.api.functions.source.FileMonitoringFunction; import org.apache.flink.streaming.api.functions.source.FileProcessingMode; import org.apache.flink.streaming.api.functions.source.FileReadFunction; import 
org.apache.flink.streaming.api.functions.source.FromElementsFunction; import org.apache.flink.streaming.api.functions.source.FromIteratorFunction; import org.apache.flink.streaming.api.functions.source.FromSplittableIteratorFunction; import org.apache.flink.streaming.api.functions.source.InputFormatSourceFunction; import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction; import org.apache.flink.streaming.api.functions.source.SocketTextStreamFunction; import org.apache.flink.streaming.api.functions.source.SourceFunction; import org.apache.flink.streaming.api.functions.source.StatefulSequenceSource; import org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit; import org.apache.flink.streaming.api.graph.StreamGraph; import org.apache.flink.streaming.api.graph.StreamGraphGenerator; import org.apache.flink.streaming.api.graph.StreamingJobGraphGenerator; import org.apache.flink.streaming.api.operators.StreamSource; import org.apache.flink.util.DynamicCodeLoadingException; import org.apache.flink.util.ExceptionUtils; import org.apache.flink.util.FlinkException; import org.apache.flink.util.InstantiationUtil; import org.apache.flink.util.Preconditions; import org.apache.flink.util.SplittableIterator; import org.apache.flink.util.StringUtils; import org.apache.flink.util.WrappingRuntimeException; import com.esotericsoftware.kryo.Serializer; import javax.annotation.Nullable; import java.io.IOException; import java.io.Serializable; import java.net.URI; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; import static org.apache.flink.util.Preconditions.checkNotNull; /** * The StreamExecutionEnvironment is the context in which a streaming program is executed. A {@link * LocalStreamEnvironment} will cause execution in the current JVM, a {@link * RemoteStreamEnvironment} will cause execution on a remote setup. * *

The environment provides methods to control the job execution (such as setting the parallelism * or the fault tolerance/checkpointing parameters) and to interact with the outside world (data * access). * * @see org.apache.flink.streaming.api.environment.LocalStreamEnvironment * @see org.apache.flink.streaming.api.environment.RemoteStreamEnvironment */ @Public public class StreamExecutionEnvironment { /** The default name to use for a streaming job if no other name has been specified. */ public static final String DEFAULT_JOB_NAME = "Flink Streaming Job"; /** The time characteristic that is used if none other is set. */ private static final TimeCharacteristic DEFAULT_TIME_CHARACTERISTIC = TimeCharacteristic.EventTime; /** * The environment of the context (local by default, cluster if invoked through command line). */ private static StreamExecutionEnvironmentFactory contextEnvironmentFactory = null; /** The ThreadLocal used to store {@link StreamExecutionEnvironmentFactory}. */ private static final ThreadLocal threadLocalContextEnvironmentFactory = new ThreadLocal<>(); /** The default parallelism used when creating a local environment. */ private static int defaultLocalParallelism = Runtime.getRuntime().availableProcessors(); // ------------------------------------------------------------------------ /** The execution configuration for this environment. */ private final ExecutionConfig config = new ExecutionConfig(); /** Settings that control the checkpointing behavior. */ private final CheckpointConfig checkpointCfg = new CheckpointConfig(); protected final List> transformations = new ArrayList<>(); private long bufferTimeout = StreamingJobGraphGenerator.UNDEFINED_NETWORK_BUFFER_TIMEOUT; protected boolean isChainingEnabled = true; /** The state backend used for storing k/v state and state snapshots. */ private StateBackend defaultStateBackend; /** The default savepoint directory used by the job. */ private Path defaultSavepointDirectory; /** The time characteristic used by the data streams. */ private TimeCharacteristic timeCharacteristic = DEFAULT_TIME_CHARACTERISTIC; protected final List> cacheFile = new ArrayList<>(); private final PipelineExecutorServiceLoader executorServiceLoader; // TODO: ------------ start:二次开发代码 --------------- // protected final Configuration configuration; // TODO: ------------ end:二次开发代码 --------------- // private final ClassLoader userClassloader; private final List jobListeners = new ArrayList<>(); // -------------------------------------------------------------------------------------------- // Constructor and Properties // -------------------------------------------------------------------------------------------- public StreamExecutionEnvironment() { this(new Configuration()); // unfortunately, StreamExecutionEnvironment always (implicitly) had a public constructor. // This constructor is not useful because the execution environment cannot be used for // execution. We're keeping this to appease the binary compatibiliy checks. } /** * Creates a new {@link StreamExecutionEnvironment} that will use the given {@link * Configuration} to configure the {@link PipelineExecutor}. */ @PublicEvolving public StreamExecutionEnvironment(final Configuration configuration) { this(configuration, null); } /** * Creates a new {@link StreamExecutionEnvironment} that will use the given {@link * Configuration} to configure the {@link PipelineExecutor}. * *

In addition, this constructor allows specifying the user code {@link ClassLoader}. */ @PublicEvolving public StreamExecutionEnvironment( final Configuration configuration, final ClassLoader userClassloader) { this(new DefaultExecutorServiceLoader(), configuration, userClassloader); } /** * Creates a new {@link StreamExecutionEnvironment} that will use the given {@link * Configuration} to configure the {@link PipelineExecutor}. * *

In addition, this constructor allows specifying the {@link PipelineExecutorServiceLoader} * and user code {@link ClassLoader}. */ @PublicEvolving public StreamExecutionEnvironment( final PipelineExecutorServiceLoader executorServiceLoader, final Configuration configuration, final ClassLoader userClassloader) { this.executorServiceLoader = checkNotNull(executorServiceLoader); this.configuration = new Configuration(checkNotNull(configuration)); this.userClassloader = userClassloader == null ? getClass().getClassLoader() : userClassloader; // the configuration of a job or an operator can be specified at the following places: // i) at the operator level using e.g. parallelism using the // SingleOutputStreamOperator.setParallelism(). // ii) programmatically by using e.g. the env.setRestartStrategy() method // iii) in the configuration passed here // // if specified in multiple places, the priority order is the above. // // Given this, it is safe to overwrite the execution config default values here because all // other ways assume // that the env is already instantiated so they will overwrite the value passed here. this.configure(this.configuration, this.userClassloader); } protected Configuration getConfiguration() { return this.configuration; } protected ClassLoader getUserClassloader() { return userClassloader; } /** Gets the config object. */ public ExecutionConfig getConfig() { return config; } /** * Get the list of cached files that were registered for distribution among the task managers. */ public List> getCachedFiles() { return cacheFile; } /** Gets the config JobListeners. */ @PublicEvolving public List getJobListeners() { return jobListeners; } /** * Sets the parallelism for operations executed through this environment. Setting a parallelism * of x here will cause all operators (such as map, batchReduce) to run with x parallel * instances. This method overrides the default parallelism for this environment. The {@link * LocalStreamEnvironment} uses by default a value equal to the number of hardware contexts (CPU * cores / threads). When executing the program via the command line client from a JAR file, the * default degree of parallelism is the one configured for that setup. * * @param parallelism The parallelism */ public StreamExecutionEnvironment setParallelism(int parallelism) { config.setParallelism(parallelism); return this; } /** * Sets the runtime execution mode for the application (see {@link RuntimeExecutionMode}). This * is equivalent to setting the {@code execution.runtime-mode} in your application's * configuration file. * *

We recommend users to NOT use this method but set the {@code execution.runtime-mode} using * the command-line when submitting the application. Keeping the application code * configuration-free allows for more flexibility as the same application will be able to be * executed in any execution mode. * * @param executionMode the desired execution mode. * @return The execution environment of your application. */ @PublicEvolving public StreamExecutionEnvironment setRuntimeMode(final RuntimeExecutionMode executionMode) { checkNotNull(executionMode); configuration.set(ExecutionOptions.RUNTIME_MODE, executionMode); return this; } /** * Sets the maximum degree of parallelism defined for the program. The upper limit (inclusive) * is Short.MAX_VALUE. * *

The maximum degree of parallelism specifies the upper limit for dynamic scaling. It also * defines the number of key groups used for partitioned state. * * @param maxParallelism Maximum degree of parallelism to be used for the program., with {@code * 0 < maxParallelism <= 2^15 - 1}. */ public StreamExecutionEnvironment setMaxParallelism(int maxParallelism) { Preconditions.checkArgument( maxParallelism > 0 && maxParallelism <= KeyGroupRangeAssignment.UPPER_BOUND_MAX_PARALLELISM, "maxParallelism is out of bounds 0 < maxParallelism <= " + KeyGroupRangeAssignment.UPPER_BOUND_MAX_PARALLELISM + ". Found: " + maxParallelism); config.setMaxParallelism(maxParallelism); return this; } /** * Gets the parallelism with which operation are executed by default. Operations can * individually override this value to use a specific parallelism. * * @return The parallelism used by operations, unless they override that value. */ public int getParallelism() { return config.getParallelism(); } /** * Gets the maximum degree of parallelism defined for the program. * *
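// Hedged sketch: typical parallelism settings, assuming `env` is a StreamExecutionEnvironment.
// The concrete numbers are illustrative only.
env.setParallelism(4);      // default parallelism for all operators in this job
env.setMaxParallelism(128); // upper bound for rescaling; also the number of key groups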

The maximum degree of parallelism specifies the upper limit for dynamic scaling. It also * defines the number of key groups used for partitioned state. * * @return Maximum degree of parallelism */ public int getMaxParallelism() { return config.getMaxParallelism(); } /** * Sets the maximum time frequency (milliseconds) for the flushing of the output buffers. By * default the output buffers flush frequently to provide low latency and to aid smooth * developer experience. Setting the parameter can result in three logical modes: * *

 * <ul>
 *   <li>A positive integer triggers flushing periodically by that integer
 *   <li>0 triggers flushing after every record thus minimizing latency
 *   <li>-1 triggers flushing only when the output buffer is full thus maximizing throughput
 * </ul>
* * @param timeoutMillis The maximum time between two output flushes. */ public StreamExecutionEnvironment setBufferTimeout(long timeoutMillis) { if (timeoutMillis < -1) { throw new IllegalArgumentException("Timeout of buffer must be non-negative or -1"); } this.bufferTimeout = timeoutMillis; return this; } /** * Gets the maximum time frequency (milliseconds) for the flushing of the output buffers. For * clarification on the extremal values see {@link #setBufferTimeout(long)}. * * @return The timeout of the buffer. */ public long getBufferTimeout() { return this.bufferTimeout; } /** * Disables operator chaining for streaming operators. Operator chaining allows non-shuffle * operations to be co-located in the same thread fully avoiding serialization and * de-serialization. * * @return StreamExecutionEnvironment with chaining disabled. */ @PublicEvolving public StreamExecutionEnvironment disableOperatorChaining() { this.isChainingEnabled = false; return this; } /** * Returns whether operator chaining is enabled. * * @return {@code true} if chaining is enabled, false otherwise. */ @PublicEvolving public boolean isChainingEnabled() { return isChainingEnabled; } // ------------------------------------------------------------------------ // Checkpointing Settings // ------------------------------------------------------------------------ /** * Gets the checkpoint config, which defines values like checkpoint interval, delay between * checkpoints, etc. * * @return The checkpoint config. */ public CheckpointConfig getCheckpointConfig() { return checkpointCfg; } /** * Enables checkpointing for the streaming job. The distributed state of the streaming dataflow * will be periodically snapshotted. In case of a failure, the streaming dataflow will be * restarted from the latest completed checkpoint. This method selects {@link * CheckpointingMode#EXACTLY_ONCE} guarantees. * *
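// Hedged sketch of the three buffer-timeout modes described above, assuming `env` is a
// StreamExecutionEnvironment; only the last call takes effect and the values are illustrative.
env.setBufferTimeout(100); // flush at least every 100 ms (latency/throughput compromise)
env.setBufferTimeout(0);   // flush after every record (lowest latency)
env.setBufferTimeout(-1);  // flush only when a buffer is full (highest throughput)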

The job draws checkpoints periodically, in the given interval. The state will be stored in * the configured state backend. * *

NOTE: Checkpointing iterative streaming dataflows in not properly supported at the moment. * For that reason, iterative jobs will not be started if used with enabled checkpointing. To * override this mechanism, use the {@link #enableCheckpointing(long, CheckpointingMode, * boolean)} method. * * @param interval Time interval between state checkpoints in milliseconds. */ public StreamExecutionEnvironment enableCheckpointing(long interval) { checkpointCfg.setCheckpointInterval(interval); return this; } /** * Enables checkpointing for the streaming job. The distributed state of the streaming dataflow * will be periodically snapshotted. In case of a failure, the streaming dataflow will be * restarted from the latest completed checkpoint. * *

The job draws checkpoints periodically, in the given interval. The system uses the given * {@link CheckpointingMode} for the checkpointing ("exactly once" vs "at least once"). The * state will be stored in the configured state backend. * *

NOTE: Checkpointing iterative streaming dataflows in not properly supported at the moment. * For that reason, iterative jobs will not be started if used with enabled checkpointing. To * override this mechanism, use the {@link #enableCheckpointing(long, CheckpointingMode, * boolean)} method. * * @param interval Time interval between state checkpoints in milliseconds. * @param mode The checkpointing mode, selecting between "exactly once" and "at least once" * guaranteed. */ public StreamExecutionEnvironment enableCheckpointing(long interval, CheckpointingMode mode) { checkpointCfg.setCheckpointingMode(mode); checkpointCfg.setCheckpointInterval(interval); return this; } /** * Enables checkpointing for the streaming job. The distributed state of the streaming dataflow * will be periodically snapshotted. In case of a failure, the streaming dataflow will be * restarted from the latest completed checkpoint. * *
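// Hedged sketch: enabling exactly-once checkpoints every 60 s and tuning the checkpoint config.
// `env` is a StreamExecutionEnvironment; the intervals are illustrative assumptions.
env.enableCheckpointing(60_000L, CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setMinPauseBetweenCheckpoints(30_000L);
env.getCheckpointConfig().setCheckpointTimeout(10 * 60_000L);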

The job draws checkpoints periodically, in the given interval. The state will be stored in * the configured state backend. * *

NOTE: Checkpointing iterative streaming dataflows in not properly supported at the moment. * If the "force" parameter is set to true, the system will execute the job nonetheless. * * @param interval Time interval between state checkpoints in millis. * @param mode The checkpointing mode, selecting between "exactly once" and "at least once" * guaranteed. * @param force If true checkpointing will be enabled for iterative jobs as well. * @deprecated Use {@link #enableCheckpointing(long, CheckpointingMode)} instead. Forcing * checkpoints will be removed in the future. */ @Deprecated @SuppressWarnings("deprecation") @PublicEvolving public StreamExecutionEnvironment enableCheckpointing( long interval, CheckpointingMode mode, boolean force) { checkpointCfg.setCheckpointingMode(mode); checkpointCfg.setCheckpointInterval(interval); checkpointCfg.setForceCheckpointing(force); return this; } /** * Enables checkpointing for the streaming job. The distributed state of the streaming dataflow * will be periodically snapshotted. In case of a failure, the streaming dataflow will be * restarted from the latest completed checkpoint. This method selects {@link * CheckpointingMode#EXACTLY_ONCE} guarantees. * *

The job draws checkpoints periodically, in the default interval. The state will be stored * in the configured state backend. * *

NOTE: Checkpointing iterative streaming dataflows in not properly supported at the moment. * For that reason, iterative jobs will not be started if used with enabled checkpointing. To * override this mechanism, use the {@link #enableCheckpointing(long, CheckpointingMode, * boolean)} method. * * @deprecated Use {@link #enableCheckpointing(long)} instead. */ @Deprecated @PublicEvolving public StreamExecutionEnvironment enableCheckpointing() { checkpointCfg.setCheckpointInterval(500); return this; } /** * Returns the checkpointing interval or -1 if checkpointing is disabled. * *

Shorthand for {@code getCheckpointConfig().getCheckpointInterval()}. * * @return The checkpointing interval or -1 */ public long getCheckpointInterval() { return checkpointCfg.getCheckpointInterval(); } /** * Returns whether checkpointing is force-enabled. * * @deprecated Forcing checkpoints will be removed in future version. */ @Deprecated @SuppressWarnings("deprecation") @PublicEvolving public boolean isForceCheckpointing() { return checkpointCfg.isForceCheckpointing(); } /** Returns whether unaligned checkpoints are enabled. */ @PublicEvolving public boolean isUnalignedCheckpointsEnabled() { return checkpointCfg.isUnalignedCheckpointsEnabled(); } /** Returns whether unaligned checkpoints are force-enabled. */ @PublicEvolving public boolean isForceUnalignedCheckpoints() { return checkpointCfg.isForceUnalignedCheckpoints(); } /** * Returns the checkpointing mode (exactly-once vs. at-least-once). * *

Shorthand for {@code getCheckpointConfig().getCheckpointingMode()}. * * @return The checkpoint mode */ public CheckpointingMode getCheckpointingMode() { return checkpointCfg.getCheckpointingMode(); } /** * Sets the state backend that describes how to store operator. It defines the data structures * that hold state during execution (for example hash tables, RocksDB, or other data stores). * *

State managed by the state backend includes both keyed state that is accessible on {@link * org.apache.flink.streaming.api.datastream.KeyedStream keyed streams}, as well as state * maintained directly by the user code that implements {@link * org.apache.flink.streaming.api.checkpoint.CheckpointedFunction CheckpointedFunction}. * *

The {@link org.apache.flink.runtime.state.hashmap.HashMapStateBackend} maintains state in * heap memory, as objects. It is lightweight without extra dependencies, but is limited to JVM * heap memory. * *

In contrast, the {@code EmbeddedRocksDBStateBackend} stores its state in an embedded * {@code RocksDB} instance. This state backend can store very large state that exceeds memory * and spills to local disk. All key/value state (including windows) is stored in the key/value * index of RocksDB. * *

In both cases, fault tolerance is managed via the jobs {@link * org.apache.flink.runtime.state.CheckpointStorage} which configures how and where state * backends persist during a checkpoint. * * @return This StreamExecutionEnvironment itself, to allow chaining of function calls. * @see #getStateBackend() * @see CheckpointConfig#setCheckpointStorage( org.apache.flink.runtime.state.CheckpointStorage) */ @PublicEvolving public StreamExecutionEnvironment setStateBackend(StateBackend backend) { this.defaultStateBackend = Preconditions.checkNotNull(backend); return this; } /** * Gets the state backend that defines how to store and checkpoint state. * * @see #setStateBackend(StateBackend) */ @PublicEvolving public StateBackend getStateBackend() { return defaultStateBackend; } /** * Sets the default savepoint directory, where savepoints will be written to if no is explicitly * provided when triggered. * * @return This StreamExecutionEnvironment itself, to allow chaining of function calls. * @see #getDefaultSavepointDirectory() */ @PublicEvolving public StreamExecutionEnvironment setDefaultSavepointDirectory(String savepointDirectory) { Preconditions.checkNotNull(savepointDirectory); return setDefaultSavepointDirectory(new Path(savepointDirectory)); } /** * Sets the default savepoint directory, where savepoints will be written to if no is explicitly * provided when triggered. * * @return This StreamExecutionEnvironment itself, to allow chaining of function calls. * @see #getDefaultSavepointDirectory() */ @PublicEvolving public StreamExecutionEnvironment setDefaultSavepointDirectory(URI savepointDirectory) { Preconditions.checkNotNull(savepointDirectory); return setDefaultSavepointDirectory(new Path(savepointDirectory)); } /** * Sets the default savepoint directory, where savepoints will be written to if no is explicitly * provided when triggered. * * @return This StreamExecutionEnvironment itself, to allow chaining of function calls. * @see #getDefaultSavepointDirectory() */ @PublicEvolving public StreamExecutionEnvironment setDefaultSavepointDirectory(Path savepointDirectory) { this.defaultSavepointDirectory = Preconditions.checkNotNull(savepointDirectory); return this; } /** * Gets the default savepoint directory for this Job. * * @see #setDefaultSavepointDirectory(Path) */ @Nullable @PublicEvolving public Path getDefaultSavepointDirectory() { return defaultSavepointDirectory; } /** * Sets the restart strategy configuration. The configuration specifies which restart strategy * will be used for the execution graph in case of a restart. * * @param restartStrategyConfiguration Restart strategy configuration to be set */ @PublicEvolving public void setRestartStrategy( RestartStrategies.RestartStrategyConfiguration restartStrategyConfiguration) { config.setRestartStrategy(restartStrategyConfiguration); } /** * Returns the specified restart strategy configuration. * * @return The restart strategy configuration to be used */ @PublicEvolving public RestartStrategies.RestartStrategyConfiguration getRestartStrategy() { return config.getRestartStrategy(); } /** * Sets the number of times that failed tasks are re-executed. A value of zero effectively * disables fault tolerance. A value of {@code -1} indicates that the system default value (as * defined in the configuration) should be used. * * @param numberOfExecutionRetries The number of times the system will try to re-execute failed * tasks. * @deprecated This method will be replaced by {@link #setRestartStrategy}. 
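// Hedged sketch: choosing a state backend and a checkpoint storage location (Flink 1.13+ API).
// HashMapStateBackend lives in org.apache.flink.runtime.state.hashmap; the HDFS path is an
// assumption. For state larger than memory, EmbeddedRocksDBStateBackend (from the
// flink-statebackend-rocksdb module) could be used instead.
env.setStateBackend(new HashMapStateBackend());
env.getCheckpointConfig().setCheckpointStorage("hdfs://namenode:8020/flink/checkpoints");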
The {@link * RestartStrategies#fixedDelayRestart(int, Time)} contains the number of execution retries. */ @Deprecated @PublicEvolving public void setNumberOfExecutionRetries(int numberOfExecutionRetries) { config.setNumberOfExecutionRetries(numberOfExecutionRetries); } /** * Gets the number of times the system will try to re-execute failed tasks. A value of {@code * -1} indicates that the system default value (as defined in the configuration) should be used. * * @return The number of times the system will try to re-execute failed tasks. * @deprecated This method will be replaced by {@link #getRestartStrategy}. */ @Deprecated @PublicEvolving public int getNumberOfExecutionRetries() { return config.getNumberOfExecutionRetries(); } // -------------------------------------------------------------------------------------------- // Registry for types and serializers // -------------------------------------------------------------------------------------------- /** * Adds a new Kryo default serializer to the Runtime. * *
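// Hedged sketch of the non-deprecated alternative: a fixed-delay restart strategy
// (3 attempts, 10 s apart); the numbers are illustrative.
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, Time.seconds(10)));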

Note that the serializer instance must be serializable (as defined by * java.io.Serializable), because it may be distributed to the worker nodes by java * serialization. * * @param type The class of the types serialized with the given serializer. * @param serializer The serializer to use. */ public & Serializable> void addDefaultKryoSerializer( Class type, T serializer) { config.addDefaultKryoSerializer(type, serializer); } /** * Adds a new Kryo default serializer to the Runtime. * * @param type The class of the types serialized with the given serializer. * @param serializerClass The class of the serializer to use. */ public void addDefaultKryoSerializer( Class type, Class> serializerClass) { config.addDefaultKryoSerializer(type, serializerClass); } /** * Registers the given type with a Kryo Serializer. * *

Note that the serializer instance must be serializable (as defined by * java.io.Serializable), because it may be distributed to the worker nodes by java * serialization. * * @param type The class of the types serialized with the given serializer. * @param serializer The serializer to use. */ public & Serializable> void registerTypeWithKryoSerializer( Class type, T serializer) { config.registerTypeWithKryoSerializer(type, serializer); } /** * Registers the given Serializer via its class as a serializer for the given type at the * KryoSerializer. * * @param type The class of the types serialized with the given serializer. * @param serializerClass The class of the serializer to use. */ @SuppressWarnings("rawtypes") public void registerTypeWithKryoSerializer( Class type, Class serializerClass) { config.registerTypeWithKryoSerializer(type, serializerClass); } /** * Registers the given type with the serialization stack. If the type is eventually serialized * as a POJO, then the type is registered with the POJO serializer. If the type ends up being * serialized with Kryo, then it will be registered at Kryo to make sure that only tags are * written. * * @param type The class of the type to register. */ public void registerType(Class type) { if (type == null) { throw new NullPointerException("Cannot register null type class."); } TypeInformation typeInfo = TypeExtractor.createTypeInfo(type); if (typeInfo instanceof PojoTypeInfo) { config.registerPojoType(type); } else { config.registerKryoType(type); } } // -------------------------------------------------------------------------------------------- // Time characteristic // -------------------------------------------------------------------------------------------- /** * Sets the time characteristic for all streams create from this environment, e.g., processing * time, event time, or ingestion time. * *
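// Hedged sketch: registering a type and a custom Kryo serializer. MyEvent and MyEventSerializer
// are hypothetical user classes; the serializer must extend com.esotericsoftware.kryo.Serializer
// and implement java.io.Serializable.
env.registerType(MyEvent.class);
env.addDefaultKryoSerializer(MyEvent.class, MyEventSerializer.class);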

If you set the characteristic to IngestionTime of EventTime this will set a default * watermark update interval of 200 ms. If this is not applicable for your application you * should change it using {@link ExecutionConfig#setAutoWatermarkInterval(long)}. * * @param characteristic The time characteristic. * @deprecated In Flink 1.12 the default stream time characteristic has been changed to {@link * TimeCharacteristic#EventTime}, thus you don't need to call this method for enabling * event-time support anymore. Explicitly using processing-time windows and timers works in * event-time mode. If you need to disable watermarks, please use {@link * ExecutionConfig#setAutoWatermarkInterval(long)}. If you are using {@link * TimeCharacteristic#IngestionTime}, please manually set an appropriate {@link * WatermarkStrategy}. If you are using generic "time window" operations (for example {@link * org.apache.flink.streaming.api.datastream.KeyedStream#timeWindow(org.apache.flink.streaming.api.windowing.time.Time)} * that change behaviour based on the time characteristic, please use equivalent operations * that explicitly specify processing time or event time. */ @PublicEvolving @Deprecated public void setStreamTimeCharacteristic(TimeCharacteristic characteristic) { this.timeCharacteristic = Preconditions.checkNotNull(characteristic); if (characteristic == TimeCharacteristic.ProcessingTime) { getConfig().setAutoWatermarkInterval(0); } else { getConfig().setAutoWatermarkInterval(200); } } /** * Gets the time characteristic. * * @deprecated See {@link #setStreamTimeCharacteristic(TimeCharacteristic)} for deprecation * notice. */ @PublicEvolving @Deprecated public TimeCharacteristic getStreamTimeCharacteristic() { return timeCharacteristic; } /** * Sets all relevant options contained in the {@link ReadableConfig} such as e.g. {@link * StreamPipelineOptions#TIME_CHARACTERISTIC}. It will reconfigure {@link * StreamExecutionEnvironment}, {@link ExecutionConfig} and {@link CheckpointConfig}. * *

It will change the value of a setting only if a corresponding option was set in the {@code * configuration}. If a key is not present, the current value of a field will remain untouched. * * @param configuration a configuration to read the values from * @param classLoader a class loader to use when loading classes */ @PublicEvolving public void configure(ReadableConfig configuration, ClassLoader classLoader) { configuration .getOptional(StreamPipelineOptions.TIME_CHARACTERISTIC) .ifPresent(this::setStreamTimeCharacteristic); Optional.ofNullable(loadStateBackend(configuration, classLoader)) .ifPresent(this::setStateBackend); configuration .getOptional(PipelineOptions.OPERATOR_CHAINING) .ifPresent(c -> this.isChainingEnabled = c); configuration .getOptional(ExecutionOptions.BUFFER_TIMEOUT) .ifPresent(t -> this.setBufferTimeout(t.toMillis())); configuration .getOptional(DeploymentOptions.JOB_LISTENERS) .ifPresent(listeners -> registerCustomListeners(classLoader, listeners)); configuration .getOptional(PipelineOptions.CACHED_FILES) .ifPresent( f -> { this.cacheFile.clear(); this.cacheFile.addAll(DistributedCache.parseCachedFilesFromString(f)); }); configuration .getOptional(ExecutionOptions.RUNTIME_MODE) .ifPresent( runtimeMode -> this.configuration.set(ExecutionOptions.RUNTIME_MODE, runtimeMode)); configuration .getOptional(ExecutionOptions.SORT_INPUTS) .ifPresent( sortInputs -> this.getConfiguration() .set(ExecutionOptions.SORT_INPUTS, sortInputs)); configuration .getOptional(ExecutionOptions.USE_BATCH_STATE_BACKEND) .ifPresent( sortInputs -> this.getConfiguration() .set(ExecutionOptions.USE_BATCH_STATE_BACKEND, sortInputs)); configuration .getOptional(PipelineOptions.NAME) .ifPresent(jobName -> this.getConfiguration().set(PipelineOptions.NAME, jobName)); config.configure(configuration, classLoader); checkpointCfg.configure(configuration); } private void registerCustomListeners( final ClassLoader classLoader, final List listeners) { for (String listener : listeners) { try { final JobListener jobListener = InstantiationUtil.instantiate(listener, JobListener.class, classLoader); jobListeners.add(jobListener); } catch (FlinkException e) { throw new WrappingRuntimeException("Could not load JobListener : " + listener, e); } } } private StateBackend loadStateBackend(ReadableConfig configuration, ClassLoader classLoader) { try { return StateBackendLoader.loadStateBackendFromConfig(configuration, classLoader, null); } catch (DynamicCodeLoadingException | IOException e) { throw new WrappingRuntimeException(e); } } // -------------------------------------------------------------------------------------------- // Data stream creations // -------------------------------------------------------------------------------------------- /** * Creates a new data stream that contains a sequence of numbers. This is a parallel source, if * you manually set the parallelism to {@code 1} (using {@link * org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator#setParallelism(int)}) * the generated sequence of elements is in order. * * @param from The number to start at (inclusive) * @param to The number to stop at (inclusive) * @return A data stream, containing all number in the [from, to] interval * @deprecated Use {@link #fromSequence(long, long)} instead to create a new data stream that * contains {@link org.apache.flink.api.connector.source.lib.NumberSequenceSource}. 
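// Hedged sketch: applying settings through configure(). The option values and job name are
// illustrative; `env` is a StreamExecutionEnvironment.
Configuration overrides = new Configuration();
overrides.set(ExecutionOptions.RUNTIME_MODE, RuntimeExecutionMode.BATCH);
overrides.set(PipelineOptions.NAME, "nightly-batch-job");
env.configure(overrides, Thread.currentThread().getContextClassLoader());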
*/ @Deprecated public DataStreamSource<Long> generateSequence(long from, long to) { if (from > to) { throw new IllegalArgumentException( "Start of sequence must not be greater than the end"); } return addSource(new StatefulSequenceSource(from, to), "Sequence Source (Deprecated)"); } /** * Creates a new data stream that contains a sequence of numbers (longs) and is useful for * testing and for cases that just need a stream of N events of any kind. * *

The generated source splits the sequence into as many parallel sub-sequences as there are * parallel source readers. Each sub-sequence will be produced in order. If the parallelism is * limited to one, the source will produce one sequence in order. * *

This source is always bounded. For very long sequences (for example over the entire domain * of long integer values), you may consider executing the application in a streaming manner * because of the end bound that is pretty far away. * *

Use {@link #fromSource(Source, WatermarkStrategy, String)} together with {@link * NumberSequenceSource} if you required more control over the created sources. For example, if * you want to set a {@link WatermarkStrategy}. * * @param from The number to start at (inclusive) * @param to The number to stop at (inclusive) */ public DataStreamSource fromSequence(long from, long to) { if (from > to) { throw new IllegalArgumentException( "Start of sequence must not be greater than the end"); } return fromSource( new NumberSequenceSource(from, to), WatermarkStrategy.noWatermarks(), "Sequence Source"); } /** * Creates a new data stream that contains the given elements. The elements must all be of the * same type, for example, all of the {@link String} or {@link Integer}. * *
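// Hedged sketch: a bounded sequence source; the range is illustrative and `env` is a
// StreamExecutionEnvironment.
DataStreamSource<Long> ids = env.fromSequence(1L, 1_000_000L);
ids.print();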

The framework will try and determine the exact type from the elements. In case of generic * elements, it may be necessary to manually supply the type information via {@link * #fromCollection(java.util.Collection, org.apache.flink.api.common.typeinfo.TypeInformation)}. * *

Note that this operation will result in a non-parallel data stream source, i.e. a data * stream source with a degree of parallelism one. * * @param data The array of elements to create the data stream from. * @param The type of the returned data stream * @return The data stream representing the given array of elements */ @SafeVarargs public final DataStreamSource fromElements(OUT... data) { if (data.length == 0) { throw new IllegalArgumentException( "fromElements needs at least one element as argument"); } TypeInformation typeInfo; try { typeInfo = TypeExtractor.getForObject(data[0]); } catch (Exception e) { throw new RuntimeException( "Could not create TypeInformation for type " + data[0].getClass().getName() + "; please specify the TypeInformation manually via " + "StreamExecutionEnvironment#fromElements(Collection, TypeInformation)", e); } return fromCollection(Arrays.asList(data), typeInfo); } /** * Creates a new data set that contains the given elements. The framework will determine the * type according to the based type user supplied. The elements should be the same or be the * subclass to the based type. The sequence of elements must not be empty. Note that this * operation will result in a non-parallel data stream source, i.e. a data stream source with a * degree of parallelism one. * * @param type The based class type in the collection. * @param data The array of elements to create the data stream from. * @param The type of the returned data stream * @return The data stream representing the given array of elements */ @SafeVarargs public final DataStreamSource fromElements(Class type, OUT... data) { if (data.length == 0) { throw new IllegalArgumentException( "fromElements needs at least one element as argument"); } TypeInformation typeInfo; try { typeInfo = TypeExtractor.getForClass(type); } catch (Exception e) { throw new RuntimeException( "Could not create TypeInformation for type " + type.getName() + "; please specify the TypeInformation manually via " + "StreamExecutionEnvironment#fromElements(Collection, TypeInformation)", e); } return fromCollection(Arrays.asList(data), typeInfo); } /** * Creates a data stream from the given non-empty collection. The type of the data stream is * that of the elements in the collection. * *

The framework will try and determine the exact type from the collection elements. In case * of generic elements, it may be necessary to manually supply the type information via {@link * #fromCollection(java.util.Collection, org.apache.flink.api.common.typeinfo.TypeInformation)}. * *

Note that this operation will result in a non-parallel data stream source, i.e. a data * stream source with parallelism one. * * @param data The collection of elements to create the data stream from. * @param The generic type of the returned data stream. * @return The data stream representing the given collection */ public DataStreamSource fromCollection(Collection data) { Preconditions.checkNotNull(data, "Collection must not be null"); if (data.isEmpty()) { throw new IllegalArgumentException("Collection must not be empty"); } OUT first = data.iterator().next(); if (first == null) { throw new IllegalArgumentException("Collection must not contain null elements"); } TypeInformation typeInfo; try { typeInfo = TypeExtractor.getForObject(first); } catch (Exception e) { throw new RuntimeException( "Could not create TypeInformation for type " + first.getClass() + "; please specify the TypeInformation manually via " + "StreamExecutionEnvironment#fromElements(Collection, TypeInformation)", e); } return fromCollection(data, typeInfo); } /** * Creates a data stream from the given non-empty collection. * *

Note that this operation will result in a non-parallel data stream source, i.e., a data * stream source with parallelism one. * * @param data The collection of elements to create the data stream from * @param typeInfo The TypeInformation for the produced data stream * @param The type of the returned data stream * @return The data stream representing the given collection */ public DataStreamSource fromCollection( Collection data, TypeInformation typeInfo) { Preconditions.checkNotNull(data, "Collection must not be null"); // must not have null elements and mixed elements FromElementsFunction.checkCollection(data, typeInfo.getTypeClass()); SourceFunction function = new FromElementsFunction<>(data); return addSource(function, "Collection Source", typeInfo, Boundedness.BOUNDED) .setParallelism(1); } /** * Creates a data stream from the given iterator. * *
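// Hedged sketch: supplying explicit TypeInformation for generic elements. TypeHint comes from
// org.apache.flink.api.common.typeinfo; the sample data is illustrative.
List<Tuple2<String, Integer>> rows = Arrays.asList(Tuple2.of("a", 1), Tuple2.of("b", 2));
DataStreamSource<Tuple2<String, Integer>> rowStream =
        env.fromCollection(rows, TypeInformation.of(new TypeHint<Tuple2<String, Integer>>() {}));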

Because the iterator will remain unmodified until the actual execution happens, the type * of data returned by the iterator must be given explicitly in the form of the type class (this * is due to the fact that the Java compiler erases the generic type information). * *

Note that this operation will result in a non-parallel data stream source, i.e., a data * stream source with a parallelism of one. * * @param data The iterator of elements to create the data stream from * @param type The class of the data produced by the iterator. Must not be a generic class. * @param The type of the returned data stream * @return The data stream representing the elements in the iterator * @see #fromCollection(java.util.Iterator, * org.apache.flink.api.common.typeinfo.TypeInformation) */ public DataStreamSource fromCollection(Iterator data, Class type) { return fromCollection(data, TypeExtractor.getForClass(type)); } /** * Creates a data stream from the given iterator. * *

Because the iterator will remain unmodified until the actual execution happens, the type * of data returned by the iterator must be given explicitly in the form of the type * information. This method is useful for cases where the type is generic. In that case, the * type class (as given in {@link #fromCollection(java.util.Iterator, Class)} does not supply * all type information. * *

Note that this operation will result in a non-parallel data stream source, i.e., a data * stream source with parallelism one. * * @param data The iterator of elements to create the data stream from * @param typeInfo The TypeInformation for the produced data stream * @param The type of the returned data stream * @return The data stream representing the elements in the iterator */ public DataStreamSource fromCollection( Iterator data, TypeInformation typeInfo) { Preconditions.checkNotNull(data, "The iterator must not be null"); SourceFunction function = new FromIteratorFunction<>(data); return addSource(function, "Collection Source", typeInfo, Boundedness.BOUNDED); } /** * Creates a new data stream that contains elements in the iterator. The iterator is splittable, * allowing the framework to create a parallel data stream source that returns the elements in * the iterator. * *

Because the iterator will remain unmodified until the actual execution happens, the type * of data returned by the iterator must be given explicitly in the form of the type class (this * is due to the fact that the Java compiler erases the generic type information). * * @param iterator The iterator that produces the elements of the data stream * @param type The class of the data produced by the iterator. Must not be a generic class. * @param The type of the returned data stream * @return A data stream representing the elements in the iterator */ public DataStreamSource fromParallelCollection( SplittableIterator iterator, Class type) { return fromParallelCollection(iterator, TypeExtractor.getForClass(type)); } /** * Creates a new data stream that contains elements in the iterator. The iterator is splittable, * allowing the framework to create a parallel data stream source that returns the elements in * the iterator. * *

Because the iterator will remain unmodified until the actual execution happens, the type * of data returned by the iterator must be given explicitly in the form of the type * information. This method is useful for cases where the type is generic. In that case, the * type class (as given in {@link * #fromParallelCollection(org.apache.flink.util.SplittableIterator, Class)} does not supply all * type information. * * @param iterator The iterator that produces the elements of the data stream * @param typeInfo The TypeInformation for the produced data stream. * @param The type of the returned data stream * @return A data stream representing the elements in the iterator */ public DataStreamSource fromParallelCollection( SplittableIterator iterator, TypeInformation typeInfo) { return fromParallelCollection(iterator, typeInfo, "Parallel Collection Source"); } // private helper for passing different names private DataStreamSource fromParallelCollection( SplittableIterator iterator, TypeInformation typeInfo, String operatorName) { return addSource( new FromSplittableIteratorFunction<>(iterator), operatorName, typeInfo, Boundedness.BOUNDED); } /** * Reads the given file line-by-line and creates a data stream that contains a string with the * contents of each such line. The file will be read with the UTF-8 character set. * *

NOTES ON CHECKPOINTING: The source monitors the path, creates the {@link * org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards them to * the downstream readers to read the actual data, and exits, without waiting for the readers to * finish reading. This implies that no more checkpoint barriers are going to be forwarded after * the source exits, thus having no checkpoints after that point. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or * "hdfs://host:port/file/path"). * @return The data stream that represents the data read from the given file as text lines */ public DataStreamSource readTextFile(String filePath) { return readTextFile(filePath, "UTF-8"); } /** * Reads the given file line-by-line and creates a data stream that contains a string with the * contents of each such line. The {@link java.nio.charset.Charset} with the given name will be * used to read the files. * *

NOTES ON CHECKPOINTING: The source monitors the path, creates the {@link * org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards them to * the downstream readers to read the actual data, and exits, without waiting for the readers to * finish reading. This implies that no more checkpoint barriers are going to be forwarded after * the source exits, thus having no checkpoints after that point. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or * "hdfs://host:port/file/path") * @param charsetName The name of the character set used to read the file * @return The data stream that represents the data read from the given file as text lines */ public DataStreamSource readTextFile(String filePath, String charsetName) { Preconditions.checkArgument( !StringUtils.isNullOrWhitespaceOnly(filePath), "The file path must not be null or blank."); TextInputFormat format = new TextInputFormat(new Path(filePath)); format.setFilesFilter(FilePathFilter.createDefaultFilter()); TypeInformation typeInfo = BasicTypeInfo.STRING_TYPE_INFO; format.setCharsetName(charsetName); return readFile(format, filePath, FileProcessingMode.PROCESS_ONCE, -1, typeInfo); } /** * Reads the contents of the user-specified {@code filePath} based on the given {@link * FileInputFormat}. * *
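// Hedged sketch: reading a text file once, line by line; the HDFS path is an assumption.
DataStreamSource<String> logLines = env.readTextFile("hdfs://namenode:8020/input/access.log");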

Since all data streams need specific information about their types, this method needs to * determine the type of the data produced by the input format. It will attempt to determine the * data type by reflection, unless the input format implements the {@link * org.apache.flink.api.java.typeutils.ResultTypeQueryable} interface. In the latter case, this * method will invoke the {@link * org.apache.flink.api.java.typeutils.ResultTypeQueryable#getProducedType()} method to * determine data type produced by the input format. * *

NOTES ON CHECKPOINTING: The source monitors the path, creates the {@link * org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards them to * the downstream readers to read the actual data, and exits, without waiting for the readers to * finish reading. This implies that no more checkpoint barriers are going to be forwarded after * the source exits, thus having no checkpoints after that point. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or * "hdfs://host:port/file/path") * @param inputFormat The input format used to create the data stream * @param The type of the returned data stream * @return The data stream that represents the data read from the given file */ public DataStreamSource readFile(FileInputFormat inputFormat, String filePath) { return readFile(inputFormat, filePath, FileProcessingMode.PROCESS_ONCE, -1); } /** * Reads the contents of the user-specified {@code filePath} based on the given {@link * FileInputFormat}. Depending on the provided {@link FileProcessingMode}. * *

See {@link #readFile(FileInputFormat, String, FileProcessingMode, long)} * * @param inputFormat The input format used to create the data stream * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or * "hdfs://host:port/file/path") * @param watchType The mode in which the source should operate, i.e. monitor path and react to * new data, or process once and exit * @param interval In the case of periodic path monitoring, this specifies the interval (in * millis) between consecutive path scans * @param filter The files to be excluded from the processing * @param The type of the returned data stream * @return The data stream that represents the data read from the given file * @deprecated Use {@link FileInputFormat#setFilesFilter(FilePathFilter)} to set a filter and * {@link StreamExecutionEnvironment#readFile(FileInputFormat, String, FileProcessingMode, * long)} */ @PublicEvolving @Deprecated public DataStreamSource readFile( FileInputFormat inputFormat, String filePath, FileProcessingMode watchType, long interval, FilePathFilter filter) { inputFormat.setFilesFilter(filter); TypeInformation typeInformation; try { typeInformation = TypeExtractor.getInputFormatTypes(inputFormat); } catch (Exception e) { throw new InvalidProgramException( "The type returned by the input format could not be " + "automatically determined. Please specify the TypeInformation of the produced type " + "explicitly by using the 'createInput(InputFormat, TypeInformation)' method instead."); } return readFile(inputFormat, filePath, watchType, interval, typeInformation); } /** * Reads the contents of the user-specified {@code filePath} based on the given {@link * FileInputFormat}. Depending on the provided {@link FileProcessingMode}, the source may * periodically monitor (every {@code interval} ms) the path for new data ({@link * FileProcessingMode#PROCESS_CONTINUOUSLY}), or process once the data currently in the path and * exit ({@link FileProcessingMode#PROCESS_ONCE}). In addition, if the path contains files not * to be processed, the user can specify a custom {@link FilePathFilter}. As a default * implementation you can use {@link FilePathFilter#createDefaultFilter()}. * *

Since all data streams need specific information about their types, this method needs to * determine the type of the data produced by the input format. It will attempt to determine the * data type by reflection, unless the input format implements the {@link * org.apache.flink.api.java.typeutils.ResultTypeQueryable} interface. In the latter case, this * method will invoke the {@link * org.apache.flink.api.java.typeutils.ResultTypeQueryable#getProducedType()} method to * determine data type produced by the input format. * *

NOTES ON CHECKPOINTING: If the {@code watchType} is set to {@link * FileProcessingMode#PROCESS_ONCE}, the source monitors the path once, creates the * {@link org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards * them to the downstream readers to read the actual data, and exits, without waiting for the * readers to finish reading. This implies that no more checkpoint barriers are going to be * forwarded after the source exits, thus having no checkpoints after that point. * * @param inputFormat The input format used to create the data stream * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or * "hdfs://host:port/file/path") * @param watchType The mode in which the source should operate, i.e. monitor path and react to * new data, or process once and exit * @param interval In the case of periodic path monitoring, this specifies the interval (in * millis) between consecutive path scans * @param The type of the returned data stream * @return The data stream that represents the data read from the given file */ @PublicEvolving public DataStreamSource readFile( FileInputFormat inputFormat, String filePath, FileProcessingMode watchType, long interval) { TypeInformation typeInformation; try { typeInformation = TypeExtractor.getInputFormatTypes(inputFormat); } catch (Exception e) { throw new InvalidProgramException( "The type returned by the input format could not be " + "automatically determined. Please specify the TypeInformation of the produced type " + "explicitly by using the 'createInput(InputFormat, TypeInformation)' method instead."); } return readFile(inputFormat, filePath, watchType, interval, typeInformation); } /** * Creates a data stream that contains the contents of file created while system watches the * given path. The file will be read with the system's default character set. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or * "hdfs://host:port/file/path/") * @param intervalMillis The interval of file watching in milliseconds * @param watchType The watch type of file stream. When watchType is {@link * org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#ONLY_NEW_FILES}, * the system processes only new files. {@link * org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#REPROCESS_WITH_APPENDED} * means that the system re-processes all contents of appended file. {@link * org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#PROCESS_ONLY_APPENDED} * means that the system processes only appended contents of files. * @return The DataStream containing the given directory. * @deprecated Use {@link #readFile(FileInputFormat, String, FileProcessingMode, long)} instead. */ @Deprecated @SuppressWarnings("deprecation") public DataStream readFileStream( String filePath, long intervalMillis, FileMonitoringFunction.WatchType watchType) { DataStream> source = addSource( new FileMonitoringFunction(filePath, intervalMillis, watchType), "Read File Stream source"); return source.flatMap(new FileReadFunction()); } /** * Reads the contents of the user-specified {@code filePath} based on the given {@link * FileInputFormat}. Depending on the provided {@link FileProcessingMode}, the source may * periodically monitor (every {@code interval} ms) the path for new data ({@link * FileProcessingMode#PROCESS_CONTINUOUSLY}), or process once the data currently in the path and * exit ({@link FileProcessingMode#PROCESS_ONCE}). 
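// Hedged sketch: continuously monitoring a directory and re-scanning it every 10 s; the path
// and interval are illustrative assumptions.
TextInputFormat format = new TextInputFormat(new Path("hdfs://namenode:8020/input/"));
format.setFilesFilter(FilePathFilter.createDefaultFilter());
DataStreamSource<String> monitored =
        env.readFile(format, "hdfs://namenode:8020/input/",
                FileProcessingMode.PROCESS_CONTINUOUSLY, 10_000L);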
In addition, if the path contains files not * to be processed, the user can specify a custom {@link FilePathFilter}. As a default * implementation you can use {@link FilePathFilter#createDefaultFilter()}. * *

NOTES ON CHECKPOINTING: If the {@code watchType} is set to {@link * FileProcessingMode#PROCESS_ONCE}, the source monitors the path once, creates the * {@link org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards * them to the downstream readers to read the actual data, and exits, without waiting for the * readers to finish reading. This implies that no more checkpoint barriers are going to be * forwarded after the source exits, thus having no checkpoints after that point. * * @param inputFormat The input format used to create the data stream * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or * "hdfs://host:port/file/path") * @param watchType The mode in which the source should operate, i.e. monitor path and react to * new data, or process once and exit * @param typeInformation Information on the type of the elements in the output stream * @param interval In the case of periodic path monitoring, this specifies the interval (in * millis) between consecutive path scans * @param The type of the returned data stream * @return The data stream that represents the data read from the given file */ @PublicEvolving public DataStreamSource readFile( FileInputFormat inputFormat, String filePath, FileProcessingMode watchType, long interval, TypeInformation typeInformation) { Preconditions.checkNotNull(inputFormat, "InputFormat must not be null."); Preconditions.checkArgument( !StringUtils.isNullOrWhitespaceOnly(filePath), "The file path must not be null or blank."); inputFormat.setFilePath(filePath); return createFileInput( inputFormat, typeInformation, "Custom File Source", watchType, interval); } /** * Creates a new data stream that contains the strings received infinitely from a socket. * Received strings are decoded by the system's default character set. On the termination of the * socket server connection retries can be initiated. * *

Let us note that the socket itself does not report on abort and as a consequence retries * are only initiated when the socket was gracefully terminated. * * @param hostname The host name which a server socket binds * @param port The port number which a server socket binds. A port number of 0 means that the * port number is automatically allocated. * @param delimiter A character which splits received strings into records * @param maxRetry The maximal retry interval in seconds while the program waits for a socket * that is temporarily down. Reconnection is initiated every second. A number of 0 means * that the reader is immediately terminated, while a negative value ensures retrying * forever. * @return A data stream containing the strings received from the socket * @deprecated Use {@link #socketTextStream(String, int, String, long)} instead. */ @Deprecated public DataStreamSource socketTextStream( String hostname, int port, char delimiter, long maxRetry) { return socketTextStream(hostname, port, String.valueOf(delimiter), maxRetry); } /** * Creates a new data stream that contains the strings received infinitely from a socket. * Received strings are decoded by the system's default character set. On the termination of the * socket server connection retries can be initiated. * *
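 * <p>Example (an illustrative sketch; host and port are placeholders): reading
 * newline-delimited records from a socket, retrying for up to 60 seconds whenever the
 * connection is closed gracefully.
 *
 * <pre>{@code
 * StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
 * DataStream<String> lines = env.socketTextStream("localhost", 9999, "\n", 60);
 * lines.print();
 * }</pre>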

Let us note that the socket itself does not report on abort and as a consequence retries * are only initiated when the socket was gracefully terminated. * * @param hostname The host name which a server socket binds * @param port The port number which a server socket binds. A port number of 0 means that the * port number is automatically allocated. * @param delimiter A string which splits received strings into records * @param maxRetry The maximal retry interval in seconds while the program waits for a socket * that is temporarily down. Reconnection is initiated every second. A number of 0 means * that the reader is immediately terminated, while a negative value ensures retrying * forever. * @return A data stream containing the strings received from the socket */ @PublicEvolving public DataStreamSource socketTextStream( String hostname, int port, String delimiter, long maxRetry) { return addSource( new SocketTextStreamFunction(hostname, port, delimiter, maxRetry), "Socket Stream"); } /** * Creates a new data stream that contains the strings received infinitely from a socket. * Received strings are decoded by the system's default character set. The reader is terminated * immediately when the socket is down. * * @param hostname The host name which a server socket binds * @param port The port number which a server socket binds. A port number of 0 means that the * port number is automatically allocated. * @param delimiter A character which splits received strings into records * @return A data stream containing the strings received from the socket * @deprecated Use {@link #socketTextStream(String, int, String)} instead. */ @Deprecated @SuppressWarnings("deprecation") public DataStreamSource socketTextStream(String hostname, int port, char delimiter) { return socketTextStream(hostname, port, delimiter, 0); } /** * Creates a new data stream that contains the strings received infinitely from a socket. * Received strings are decoded by the system's default character set. The reader is terminated * immediately when the socket is down. * * @param hostname The host name which a server socket binds * @param port The port number which a server socket binds. A port number of 0 means that the * port number is automatically allocated. * @param delimiter A string which splits received strings into records * @return A data stream containing the strings received from the socket */ @PublicEvolving public DataStreamSource socketTextStream(String hostname, int port, String delimiter) { return socketTextStream(hostname, port, delimiter, 0); } /** * Creates a new data stream that contains the strings received infinitely from a socket. * Received strings are decoded by the system's default character set, using"\n" as delimiter. * The reader is terminated immediately when the socket is down. * * @param hostname The host name which a server socket binds * @param port The port number which a server socket binds. A port number of 0 means that the * port number is automatically allocated. * @return A data stream containing the strings received from the socket */ @PublicEvolving public DataStreamSource socketTextStream(String hostname, int port) { return socketTextStream(hostname, port, "\n"); } /** * Generic method to create an input data stream with {@link * org.apache.flink.api.common.io.InputFormat}. * *

Since all data streams need specific information about their types, this method needs to * determine the type of the data produced by the input format. It will attempt to determine the * data type by reflection, unless the input format implements the {@link * org.apache.flink.api.java.typeutils.ResultTypeQueryable} interface. In the latter case, this * method will invoke the {@link * org.apache.flink.api.java.typeutils.ResultTypeQueryable#getProducedType()} method to * determine data type produced by the input format. * *

NOTES ON CHECKPOINTING: In the case of a {@link FileInputFormat}, the source * (which executes the {@link ContinuousFileMonitoringFunction}) monitors the path, creates the * {@link org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards * them to the downstream readers to read the actual data, and exits, without waiting for the * readers to finish reading. This implies that no more checkpoint barriers are going to be * forwarded after the source exits, thus having no checkpoints. * * @param inputFormat The input format used to create the data stream * @param The type of the returned data stream * @return The data stream that represents the data created by the input format */ @PublicEvolving public DataStreamSource createInput(InputFormat inputFormat) { return createInput(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat)); } /** * Generic method to create an input data stream with {@link * org.apache.flink.api.common.io.InputFormat}. * *

The data stream is typed to the given TypeInformation. This method is intended for input * formats where the return type cannot be determined by reflection analysis, and that do not * implement the {@link org.apache.flink.api.java.typeutils.ResultTypeQueryable} interface. * *
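 * <p>Example (an illustrative sketch; {@code MyCustomInputFormat} is a hypothetical input
 * format producing {@code Tuple2<String, Integer>} records whose type cannot be extracted by
 * reflection):
 *
 * <pre>{@code
 * StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
 * DataStream<Tuple2<String, Integer>> records =
 *         env.createInput(
 *                 new MyCustomInputFormat(),
 *                 TypeInformation.of(new TypeHint<Tuple2<String, Integer>>() {}));
 * }</pre>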

NOTES ON CHECKPOINTING: In the case of a {@link FileInputFormat}, the source * (which executes the {@link ContinuousFileMonitoringFunction}) monitors the path, creates the * {@link org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards * them to the downstream readers to read the actual data, and exits, without waiting for the * readers to finish reading. This implies that no more checkpoint barriers are going to be * forwarded after the source exits, thus having no checkpoints. * * @param inputFormat The input format used to create the data stream * @param typeInfo The information about the type of the output type * @param The type of the returned data stream * @return The data stream that represents the data created by the input format */ @PublicEvolving public DataStreamSource createInput( InputFormat inputFormat, TypeInformation typeInfo) { DataStreamSource source; if (inputFormat instanceof FileInputFormat) { @SuppressWarnings("unchecked") FileInputFormat format = (FileInputFormat) inputFormat; source = createFileInput( format, typeInfo, "Custom File source", FileProcessingMode.PROCESS_ONCE, -1); } else { source = createInput(inputFormat, typeInfo, "Custom Source"); } return source; } private DataStreamSource createInput( InputFormat inputFormat, TypeInformation typeInfo, String sourceName) { InputFormatSourceFunction function = new InputFormatSourceFunction<>(inputFormat, typeInfo); return addSource(function, sourceName, typeInfo); } private DataStreamSource createFileInput( FileInputFormat inputFormat, TypeInformation typeInfo, String sourceName, FileProcessingMode monitoringMode, long interval) { Preconditions.checkNotNull(inputFormat, "Unspecified file input format."); Preconditions.checkNotNull(typeInfo, "Unspecified output type information."); Preconditions.checkNotNull(sourceName, "Unspecified name for the source."); Preconditions.checkNotNull(monitoringMode, "Unspecified monitoring mode."); Preconditions.checkArgument( monitoringMode.equals(FileProcessingMode.PROCESS_ONCE) || interval >= ContinuousFileMonitoringFunction.MIN_MONITORING_INTERVAL, "The path monitoring interval cannot be less than " + ContinuousFileMonitoringFunction.MIN_MONITORING_INTERVAL + " ms."); ContinuousFileMonitoringFunction monitoringFunction = new ContinuousFileMonitoringFunction<>( inputFormat, monitoringMode, getParallelism(), interval); ContinuousFileReaderOperatorFactory factory = new ContinuousFileReaderOperatorFactory<>(inputFormat); final Boundedness boundedness = monitoringMode == FileProcessingMode.PROCESS_ONCE ? Boundedness.BOUNDED : Boundedness.CONTINUOUS_UNBOUNDED; SingleOutputStreamOperator source = addSource(monitoringFunction, sourceName, null, boundedness) .transform("Split Reader: " + sourceName, typeInfo, factory); return new DataStreamSource<>(source); } /** * Adds a Data Source to the streaming topology. * *
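 * <p>Example (an illustrative sketch, assuming {@code env} is an already created environment):
 * a simple single-threaded {@code SourceFunction} that emits a few numbers and then finishes.
 *
 * <pre>{@code
 * DataStream<Long> numbers =
 *         env.addSource(
 *                 new SourceFunction<Long>() {
 *                     private volatile boolean running = true;
 *
 *                     @Override
 *                     public void run(SourceContext<Long> ctx) throws Exception {
 *                         for (long i = 0; running && i < 100; i++) {
 *                             ctx.collect(i);
 *                         }
 *                     }
 *
 *                     @Override
 *                     public void cancel() {
 *                         running = false;
 *                     }
 *                 });
 * }</pre>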

By default sources have a parallelism of 1. To enable parallel execution, the user defined * source should implement {@link * org.apache.flink.streaming.api.functions.source.ParallelSourceFunction} or extend {@link * org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction}. In these cases * the resulting source will have the parallelism of the environment. To change this afterwards * call {@link org.apache.flink.streaming.api.datastream.DataStreamSource#setParallelism(int)} * * @param function the user defined function * @param type of the returned stream * @return the data stream constructed */ public DataStreamSource addSource(SourceFunction function) { return addSource(function, "Custom Source"); } /** * Adds a data source with a custom type information thus opening a {@link DataStream}. Only in * very special cases does the user need to support type information. Otherwise use {@link * #addSource(org.apache.flink.streaming.api.functions.source.SourceFunction)} * * @param function the user defined function * @param sourceName Name of the data source * @param type of the returned stream * @return the data stream constructed */ public DataStreamSource addSource(SourceFunction function, String sourceName) { return addSource(function, sourceName, null); } /** * Ads a data source with a custom type information thus opening a {@link DataStream}. Only in * very special cases does the user need to support type information. Otherwise use {@link * #addSource(org.apache.flink.streaming.api.functions.source.SourceFunction)} * * @param function the user defined function * @param type of the returned stream * @param typeInfo the user defined type information for the stream * @return the data stream constructed */ public DataStreamSource addSource( SourceFunction function, TypeInformation typeInfo) { return addSource(function, "Custom Source", typeInfo); } /** * Ads a data source with a custom type information thus opening a {@link DataStream}. Only in * very special cases does the user need to support type information. Otherwise use {@link * #addSource(org.apache.flink.streaming.api.functions.source.SourceFunction)} * * @param function the user defined function * @param sourceName Name of the data source * @param type of the returned stream * @param typeInfo the user defined type information for the stream * @return the data stream constructed */ public DataStreamSource addSource( SourceFunction function, String sourceName, TypeInformation typeInfo) { return addSource(function, sourceName, typeInfo, Boundedness.CONTINUOUS_UNBOUNDED); } private DataStreamSource addSource( final SourceFunction function, final String sourceName, @Nullable final TypeInformation typeInfo, final Boundedness boundedness) { checkNotNull(function); checkNotNull(sourceName); checkNotNull(boundedness); TypeInformation resolvedTypeInfo = getTypeInfo(function, sourceName, SourceFunction.class, typeInfo); boolean isParallel = function instanceof ParallelSourceFunction; clean(function); final StreamSource sourceOperator = new StreamSource<>(function); return new DataStreamSource<>( this, resolvedTypeInfo, sourceOperator, isParallel, sourceName, boundedness); } /** * Adds a data {@link Source} to the environment to get a {@link DataStream}. * *

The result will be either a bounded data stream (that can be processed in a batch way) or * an unbounded data stream (that must be processed in a streaming way), based on the * boundedness property of the source, as defined by {@link Source#getBoundedness()}. * *
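 * <p>Example (an illustrative sketch, assuming {@code env} is an already created environment):
 * {@code NumberSequenceSource} reports {@code Boundedness.BOUNDED}, so the resulting stream can
 * also be processed in batch execution mode.
 *
 * <pre>{@code
 * DataStream<Long> numbers =
 *         env.fromSource(
 *                 new NumberSequenceSource(1, 1_000),
 *                 WatermarkStrategy.noWatermarks(),
 *                 "number-sequence");
 * }</pre>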

The result type (that is used to create serializers for the produced data events) will be * automatically extracted. This is useful for sources that describe the produced types already * in their configuration, to avoid having to declare the type multiple times. For example the * file sources and Kafka sources already define the produced type by their * parsers/serializers/formats, and can forward that information. * * @param source the user defined source * @param sourceName Name of the data source * @param type of the returned stream * @return the data stream constructed */ @Experimental public DataStreamSource fromSource( Source source, WatermarkStrategy timestampsAndWatermarks, String sourceName) { return fromSource(source, timestampsAndWatermarks, sourceName, null); } /** * Adds a data {@link Source} to the environment to get a {@link DataStream}. * *

The result will be either a bounded data stream (that can be processed in a batch way) or * an unbounded data stream (that must be processed in a streaming way), based on the * boundedness property of the source, as defined by {@link Source#getBoundedness()}. * *

This method takes an explicit type information for the produced data stream, so that * callers can define directly what type/serializer will be used for the produced stream. For * sources that describe their produced type, the method {@link #fromSource(Source, * WatermarkStrategy, String)} can be used to avoid specifying the produced type redundantly. * * @param source the user defined source * @param sourceName Name of the data source * @param type of the returned stream * @param typeInfo the user defined type information for the stream * @return the data stream constructed */ @Experimental public DataStreamSource fromSource( Source source, WatermarkStrategy timestampsAndWatermarks, String sourceName, TypeInformation typeInfo) { final TypeInformation resolvedTypeInfo = getTypeInfo(source, sourceName, Source.class, typeInfo); return new DataStreamSource<>( this, checkNotNull(source, "source"), checkNotNull(timestampsAndWatermarks, "timestampsAndWatermarks"), checkNotNull(resolvedTypeInfo), checkNotNull(sourceName)); } /** * Triggers the program execution. The environment will execute all parts of the program that * have resulted in a "sink" operation. Sink operations are for example printing results or * forwarding them to a message queue. * *
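 * <p>Example (an illustrative sketch, assuming {@code env} is an already created environment):
 * the pipeline only runs once a sink such as {@code print()} is defined and {@code execute()}
 * is called; the returned result exposes the runtime and accumulator values.
 *
 * <pre>{@code
 * env.fromElements(1, 2, 3).map(i -> i * 2).print();
 * JobExecutionResult result = env.execute();
 * long runtimeMillis = result.getNetRuntime();
 * }</pre>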

The program execution will be logged and displayed with a generated default name. * * @return The result of the job execution, containing elapsed time and accumulators. * @throws Exception which occurs during job execution. */ public JobExecutionResult execute() throws Exception { return execute(getJobName()); } /** * Triggers the program execution. The environment will execute all parts of the program that * have resulted in a "sink" operation. Sink operations are for example printing results or * forwarding them to a message queue. * *

The program execution will be logged and displayed with the provided name * * @param jobName Desired name of the job * @return The result of the job execution, containing elapsed time and accumulators. * @throws Exception which occurs during job execution. */ public JobExecutionResult execute(String jobName) throws Exception { Preconditions.checkNotNull(jobName, "Streaming Job name should not be null."); return execute(getStreamGraph(jobName)); } /** * Triggers the program execution. The environment will execute all parts of the program that * have resulted in a "sink" operation. Sink operations are for example printing results or * forwarding them to a message queue. * * @param streamGraph the stream graph representing the transformations * @return The result of the job execution, containing elapsed time and accumulators. * @throws Exception which occurs during job execution. */ @Internal public JobExecutionResult execute(StreamGraph streamGraph) throws Exception { final JobClient jobClient = executeAsync(streamGraph); try { final JobExecutionResult jobExecutionResult; if (configuration.getBoolean(DeploymentOptions.ATTACHED)) { jobExecutionResult = jobClient.getJobExecutionResult().get(); } else { jobExecutionResult = new DetachedJobExecutionResult(jobClient.getJobID()); } jobListeners.forEach( jobListener -> jobListener.onJobExecuted(jobExecutionResult, null)); return jobExecutionResult; } catch (Throwable t) { // get() on the JobExecutionResult Future will throw an ExecutionException. This // behaviour was largely not there in Flink versions before the PipelineExecutor // refactoring so we should strip that exception. Throwable strippedException = ExceptionUtils.stripExecutionException(t); jobListeners.forEach( jobListener -> { jobListener.onJobExecuted(null, strippedException); }); ExceptionUtils.rethrowException(strippedException); // never reached, only make javac happy return null; } } /** * Register a {@link JobListener} in this environment. The {@link JobListener} will be notified * on specific job status changed. */ @PublicEvolving public void registerJobListener(JobListener jobListener) { checkNotNull(jobListener, "JobListener cannot be null"); jobListeners.add(jobListener); } /** Clear all registered {@link JobListener}s. */ @PublicEvolving public void clearJobListeners() { this.jobListeners.clear(); } /** * Triggers the program asynchronously. The environment will execute all parts of the program * that have resulted in a "sink" operation. Sink operations are for example printing results or * forwarding them to a message queue. * *
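 * <p>Example (an illustrative sketch, assuming {@code env} is an already created environment):
 * submitting the job without blocking and using the returned {@link JobClient} to observe or
 * cancel it later.
 *
 * <pre>{@code
 * JobClient client = env.executeAsync();
 * CompletableFuture<JobStatus> status = client.getJobStatus();
 * // ... later, if the job should be stopped:
 * client.cancel();
 * }</pre>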

The program execution will be logged and displayed with a generated default name. * * @return A {@link JobClient} that can be used to communicate with the submitted job, completed * on submission succeeded. * @throws Exception which occurs during job execution. */ @PublicEvolving public final JobClient executeAsync() throws Exception { return executeAsync(getJobName()); } /** * Triggers the program execution asynchronously. The environment will execute all parts of the * program that have resulted in a "sink" operation. Sink operations are for example printing * results or forwarding them to a message queue. * *

The program execution will be logged and displayed with the provided name * * @param jobName desired name of the job * @return A {@link JobClient} that can be used to communicate with the submitted job, completed * on submission succeeded. * @throws Exception which occurs during job execution. */ @PublicEvolving public JobClient executeAsync(String jobName) throws Exception { return executeAsync(getStreamGraph(checkNotNull(jobName))); } /** * Triggers the program execution asynchronously. The environment will execute all parts of the * program that have resulted in a "sink" operation. Sink operations are for example printing * results or forwarding them to a message queue. * * @param streamGraph the stream graph representing the transformations * @return A {@link JobClient} that can be used to communicate with the submitted job, completed * on submission succeeded. * @throws Exception which occurs during job execution. */ @Internal public JobClient executeAsync(StreamGraph streamGraph) throws Exception { checkNotNull(streamGraph, "StreamGraph cannot be null."); checkNotNull( configuration.get(DeploymentOptions.TARGET), "No execution.target specified in your configuration file."); final PipelineExecutorFactory executorFactory = executorServiceLoader.getExecutorFactory(configuration); checkNotNull( executorFactory, "Cannot find compatible factory for specified execution.target (=%s)", configuration.get(DeploymentOptions.TARGET)); CompletableFuture jobClientFuture = executorFactory .getExecutor(configuration) .execute(streamGraph, configuration, userClassloader); try { JobClient jobClient = jobClientFuture.get(); jobListeners.forEach(jobListener -> jobListener.onJobSubmitted(jobClient, null)); return jobClient; } catch (ExecutionException executionException) { final Throwable strippedException = ExceptionUtils.stripExecutionException(executionException); jobListeners.forEach( jobListener -> jobListener.onJobSubmitted(null, strippedException)); throw new FlinkException( String.format("Failed to execute job '%s'.", streamGraph.getJobName()), strippedException); } } /** * Getter of the {@link org.apache.flink.streaming.api.graph.StreamGraph} of the streaming job. * This call clears previously registered {@link Transformation transformations}. * * @return The streamgraph representing the transformations */ @Internal public StreamGraph getStreamGraph() { return getStreamGraph(getJobName()); } /** * Getter of the {@link org.apache.flink.streaming.api.graph.StreamGraph} of the streaming job. * This call clears previously registered {@link Transformation transformations}. * * @param jobName Desired name of the job * @return The streamgraph representing the transformations */ @Internal public StreamGraph getStreamGraph(String jobName) { return getStreamGraph(jobName, true); } /** * Getter of the {@link org.apache.flink.streaming.api.graph.StreamGraph StreamGraph} of the * streaming job with the option to clear previously registered {@link Transformation * transformations}. Clearing the transformations allows, for example, to not re-execute the * same operations when calling {@link #execute()} multiple times. 
* * @param jobName Desired name of the job * @param clearTransformations Whether or not to clear previously registered transformations * @return The streamgraph representing the transformations */ @Internal public StreamGraph getStreamGraph(String jobName, boolean clearTransformations) { StreamGraph streamGraph = getStreamGraphGenerator().setJobName(jobName).generate(); if (clearTransformations) { this.transformations.clear(); } return streamGraph; } private StreamGraphGenerator getStreamGraphGenerator() { if (transformations.size() <= 0) { throw new IllegalStateException( "No operators defined in streaming topology. Cannot execute."); } final RuntimeExecutionMode executionMode = configuration.get(ExecutionOptions.RUNTIME_MODE); return new StreamGraphGenerator(transformations, config, checkpointCfg, getConfiguration()) .setRuntimeExecutionMode(executionMode) .setStateBackend(defaultStateBackend) .setSavepointDir(defaultSavepointDirectory) .setChaining(isChainingEnabled) .setUserArtifacts(cacheFile) .setTimeCharacteristic(timeCharacteristic) .setDefaultBufferTimeout(bufferTimeout); } /** * Creates the plan with which the system will execute the program, and returns it as a String * using a JSON representation of the execution data flow graph. Note that this needs to be * called, before the plan is executed. * * @return The execution plan of the program, as a JSON String. */ public String getExecutionPlan() { return getStreamGraph(getJobName(), false).getStreamingPlanAsJSON(); } /** * Returns a "closure-cleaned" version of the given function. Cleans only if closure cleaning is * not disabled in the {@link org.apache.flink.api.common.ExecutionConfig} */ @Internal public F clean(F f) { if (getConfig().isClosureCleanerEnabled()) { ClosureCleaner.clean(f, getConfig().getClosureCleanerLevel(), true); } ClosureCleaner.ensureSerializable(f); return f; } /** * Adds an operator to the list of operators that should be executed when calling {@link * #execute}. * *

When calling {@link #execute()} only the operators that were previously added to the list * are executed. * *

This is not meant to be used by users. The API methods that create operators must call * this method. */ @Internal public void addOperator(Transformation transformation) { Preconditions.checkNotNull(transformation, "transformation must not be null."); this.transformations.add(transformation); } // -------------------------------------------------------------------------------------------- // Factory methods for ExecutionEnvironments // -------------------------------------------------------------------------------------------- /** * Creates an execution environment that represents the context in which the program is * currently executed. If the program is invoked standalone, this method returns a local * execution environment, as returned by {@link #createLocalEnvironment()}. * * @return The execution environment of the context in which the program is executed. */ public static StreamExecutionEnvironment getExecutionEnvironment() { return getExecutionEnvironment(new Configuration()); } /** * Creates an execution environment that represents the context in which the program is * currently executed. If the program is invoked standalone, this method returns a local * execution environment, as returned by {@link #createLocalEnvironment(Configuration)}. * *
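 * <p>Example (an illustrative sketch): passing per-job options through a {@link Configuration},
 * here switching the runtime mode to batch execution.
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * conf.set(ExecutionOptions.RUNTIME_MODE, RuntimeExecutionMode.BATCH);
 * StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(conf);
 * }</pre>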

When executed from the command line the given configuration is stacked on top of the * global configuration which comes from the {@code flink-conf.yaml}, potentially overriding * duplicated options. * * @param configuration The configuration to instantiate the environment with. * @return The execution environment of the context in which the program is executed. */ public static StreamExecutionEnvironment getExecutionEnvironment(Configuration configuration) { return Utils.resolveFactory(threadLocalContextEnvironmentFactory, contextEnvironmentFactory) .map(factory -> factory.createExecutionEnvironment(configuration)) .orElseGet(() -> StreamExecutionEnvironment.createLocalEnvironment(configuration)); } /** * Creates a {@link LocalStreamEnvironment}. The local execution environment will run the * program in a multi-threaded fashion in the same JVM as the environment was created in. The * default parallelism of the local environment is the number of hardware contexts (CPU cores / * threads), unless it was specified differently by {@link #setParallelism(int)}. * * @return A local execution environment. */ public static LocalStreamEnvironment createLocalEnvironment() { return createLocalEnvironment(defaultLocalParallelism); } /** * Creates a {@link LocalStreamEnvironment}. The local execution environment will run the * program in a multi-threaded fashion in the same JVM as the environment was created in. It * will use the parallelism specified in the parameter. * * @param parallelism The parallelism for the local environment. * @return A local execution environment with the specified parallelism. */ public static LocalStreamEnvironment createLocalEnvironment(int parallelism) { return createLocalEnvironment(parallelism, new Configuration()); } /** * Creates a {@link LocalStreamEnvironment}. The local execution environment will run the * program in a multi-threaded fashion in the same JVM as the environment was created in. It * will use the parallelism specified in the parameter. * * @param parallelism The parallelism for the local environment. * @param configuration Pass a custom configuration into the cluster * @return A local execution environment with the specified parallelism. */ public static LocalStreamEnvironment createLocalEnvironment( int parallelism, Configuration configuration) { Configuration copyOfConfiguration = new Configuration(); copyOfConfiguration.addAll(configuration); copyOfConfiguration.set(CoreOptions.DEFAULT_PARALLELISM, parallelism); return createLocalEnvironment(copyOfConfiguration); } /** * Creates a {@link LocalStreamEnvironment}. The local execution environment will run the * program in a multi-threaded fashion in the same JVM as the environment was created in. * * @param configuration Pass a custom configuration into the cluster * @return A local execution environment with the specified parallelism. */ public static LocalStreamEnvironment createLocalEnvironment(Configuration configuration) { if (configuration.getOptional(CoreOptions.DEFAULT_PARALLELISM).isPresent()) { return new LocalStreamEnvironment(configuration); } else { Configuration copyOfConfiguration = new Configuration(); copyOfConfiguration.addAll(configuration); copyOfConfiguration.set(CoreOptions.DEFAULT_PARALLELISM, defaultLocalParallelism); return new LocalStreamEnvironment(copyOfConfiguration); } } /** * Creates a {@link LocalStreamEnvironment} for local program execution that also starts the web * monitoring UI. * *

The local execution environment will run the program in a multi-threaded fashion in the * same JVM as the environment was created in. It will use the parallelism specified in the * parameter. * *
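 * <p>Example (an illustrative sketch; the port is a placeholder): starting a local environment
 * whose web UI listens on port 8082 instead of the default 8081.
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * conf.setInteger(RestOptions.PORT, 8082);
 * StreamExecutionEnvironment env =
 *         StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
 * }</pre>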

If the configuration key 'rest.port' was set in the configuration, that particular port * will be used for the web UI. Otherwise, the default port (8081) will be used. */ @PublicEvolving public static StreamExecutionEnvironment createLocalEnvironmentWithWebUI(Configuration conf) { checkNotNull(conf, "conf"); if (!conf.contains(RestOptions.PORT)) { // explicitly set this option so that it's not set to 0 later conf.setInteger(RestOptions.PORT, RestOptions.PORT.defaultValue()); } return createLocalEnvironment(conf); } /** * Creates a {@link RemoteStreamEnvironment}. The remote environment sends (parts of) the * program to a cluster for execution. Note that all file paths used in the program must be * accessible from the cluster. The execution will use no parallelism, unless the parallelism is * set explicitly via {@link #setParallelism}. * * @param host The host name or address of the master (JobManager), where the program should be * executed. * @param port The port of the master (JobManager), where the program should be executed. * @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the * program uses user-defined functions, user-defined input formats, or any libraries, those * must be provided in the JAR files. * @return A remote environment that executes the program on a cluster. */ public static StreamExecutionEnvironment createRemoteEnvironment( String host, int port, String... jarFiles) { return new RemoteStreamEnvironment(host, port, jarFiles); } /** * Creates a {@link RemoteStreamEnvironment}. The remote environment sends (parts of) the * program to a cluster for execution. Note that all file paths used in the program must be * accessible from the cluster. The execution will use the specified parallelism. * * @param host The host name or address of the master (JobManager), where the program should be * executed. * @param port The port of the master (JobManager), where the program should be executed. * @param parallelism The parallelism to use during the execution. * @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the * program uses user-defined functions, user-defined input formats, or any libraries, those * must be provided in the JAR files. * @return A remote environment that executes the program on a cluster. */ public static StreamExecutionEnvironment createRemoteEnvironment( String host, int port, int parallelism, String... jarFiles) { RemoteStreamEnvironment env = new RemoteStreamEnvironment(host, port, jarFiles); env.setParallelism(parallelism); return env; } /** * Creates a {@link RemoteStreamEnvironment}. The remote environment sends (parts of) the * program to a cluster for execution. Note that all file paths used in the program must be * accessible from the cluster. The execution will use the specified parallelism. * * @param host The host name or address of the master (JobManager), where the program should be * executed. * @param port The port of the master (JobManager), where the program should be executed. * @param clientConfig The configuration used by the client that connects to the remote cluster. * @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the * program uses user-defined functions, user-defined input formats, or any libraries, those * must be provided in the JAR files. * @return A remote environment that executes the program on a cluster. 
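 * <p>Example (an illustrative sketch; host, port and the jar path are placeholders):
 *
 * <pre>{@code
 * Configuration clientConfig = new Configuration();
 * StreamExecutionEnvironment env =
 *         StreamExecutionEnvironment.createRemoteEnvironment(
 *                 "jobmanager-host", 8081, clientConfig, "/path/to/job-with-udfs.jar");
 * }</pre>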
*/ public static StreamExecutionEnvironment createRemoteEnvironment( String host, int port, Configuration clientConfig, String... jarFiles) { return new RemoteStreamEnvironment(host, port, clientConfig, jarFiles); } /** * Gets the default parallelism that will be used for the local execution environment created by * {@link #createLocalEnvironment()}. * * @return The default local parallelism */ @PublicEvolving public static int getDefaultLocalParallelism() { return defaultLocalParallelism; } /** * Sets the default parallelism that will be used for the local execution environment created by * {@link #createLocalEnvironment()}. * * @param parallelism The parallelism to use as the default local parallelism. */ @PublicEvolving public static void setDefaultLocalParallelism(int parallelism) { defaultLocalParallelism = parallelism; } // -------------------------------------------------------------------------------------------- // Methods to control the context and local environments for execution from packaged programs // -------------------------------------------------------------------------------------------- protected static void initializeContextEnvironment(StreamExecutionEnvironmentFactory ctx) { contextEnvironmentFactory = ctx; threadLocalContextEnvironmentFactory.set(contextEnvironmentFactory); } protected static void resetContextEnvironment() { contextEnvironmentFactory = null; threadLocalContextEnvironmentFactory.remove(); } /** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files may be * local files (which will be distributed via BlobServer), or files in a distributed file * system. The runtime will copy the files temporarily to a local cache, if needed. * *

The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside * UDFs via {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and * provides access {@link org.apache.flink.api.common.cache.DistributedCache} via {@link * org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or * "hdfs://host:port/and/path") * @param name The name under which the file is registered. */ public void registerCachedFile(String filePath, String name) { registerCachedFile(filePath, name, false); } /** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files may be * local files (which will be distributed via BlobServer), or files in a distributed file * system. The runtime will copy the files temporarily to a local cache, if needed. * *
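 * <p>Example (an illustrative sketch; the HDFS path and the name {@code "dictionary"} are
 * placeholders): registering a non-executable file and reading it later from inside a rich
 * function.
 *
 * <pre>{@code
 * env.registerCachedFile("hdfs:///data/dictionary.txt", "dictionary", false);
 *
 * // inside any RichFunction, e.g. a RichMapFunction:
 * File dictionary = getRuntimeContext().getDistributedCache().getFile("dictionary");
 * }</pre>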

The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside * UDFs via {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and * provides access {@link org.apache.flink.api.common.cache.DistributedCache} via {@link * org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or * "hdfs://host:port/and/path") * @param name The name under which the file is registered. * @param executable flag indicating whether the file should be executable */ public void registerCachedFile(String filePath, String name, boolean executable) { this.cacheFile.add( new Tuple2<>( name, new DistributedCache.DistributedCacheEntry(filePath, executable))); } // Private helpers. @SuppressWarnings("unchecked") private > T getTypeInfo( Object source, String sourceName, Class baseSourceClass, TypeInformation typeInfo) { TypeInformation resolvedTypeInfo = typeInfo; if (resolvedTypeInfo == null && source instanceof ResultTypeQueryable) { resolvedTypeInfo = ((ResultTypeQueryable) source).getProducedType(); } if (resolvedTypeInfo == null) { try { resolvedTypeInfo = TypeExtractor.createTypeInfo( baseSourceClass, source.getClass(), 0, null, null); } catch (final InvalidTypesException e) { resolvedTypeInfo = (TypeInformation) new MissingTypeInfo(sourceName, e); } } return (T) resolvedTypeInfo; } private String getJobName() { return configuration.getString(PipelineOptions.NAME, DEFAULT_JOB_NAME); } } ================================================ FILE: fire-shell/flink-shell/src/main/scala/com/zto/fire/shell/flink/FireILoop.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.shell.flink import com.zto.fire.common.conf.FireFrameworkConf import com.zto.fire.common.util.{FireUtils, PropUtils} import org.apache.flink.api.java.{JarHelper, ScalaShellEnvironment, ScalaShellStreamEnvironment} import org.apache.flink.configuration.Configuration import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment import org.apache.flink.util.AbstractID import java.io.{BufferedReader, File, FileOutputStream} import scala.tools.nsc.interpreter._ class FireILoop( val flinkConfig: Configuration, val externalJars: Option[Array[String]], in0: Option[BufferedReader], out0: JPrintWriter) extends ILoop(in0, out0) { def this( flinkConfig: Configuration, externalJars: Option[Array[String]], in0: BufferedReader, out: JPrintWriter) { this(flinkConfig, externalJars, Some(in0), out) } def this( flinkConfig: Configuration, externalJars: Option[Array[String]]) { this(flinkConfig, externalJars, None, new JPrintWriter(Console.out, true)) } def this( flinkConfig: Configuration, in0: BufferedReader, out: JPrintWriter){ this(flinkConfig, None, in0, out) } // remote environment private lazy val (remoteBenv: ScalaShellEnvironment, remoteSenv: ScalaShellStreamEnvironment) = { // allow creation of environments ScalaShellEnvironment.resetContextEnvironments() ScalaShellStreamEnvironment.resetContextEnvironments() // create our environment that submits against the cluster (local or remote) val remoteBenv = new ScalaShellEnvironment( flinkConfig, this, this.getExternalJars(): _*) val remoteSenv = new ScalaShellStreamEnvironment( flinkConfig, this, getExternalJars(): _*) // prevent further instantiation of environments ScalaShellEnvironment.disableAllContextAndOtherEnvironments() ScalaShellStreamEnvironment.disableAllContextAndOtherEnvironments() (remoteBenv,remoteSenv) } // local environment val ( env: StreamExecutionEnvironment, table: StreamTableEnvironment ) = { PropUtils.setProperty(FireFrameworkConf.FIRE_ENV_LOCAL, "true") Test.main(null) val scalaSenv = Test.getFire val scalaSTEnv = Test.getStreamTableEnv (scalaSenv, scalaSTEnv) } /** * creates a temporary directory to store compiled console files */ private val tmpDirBase: File = { // get unique temporary folder: val abstractID: String = new AbstractID().toString val tmpDir: File = new File( System.getProperty("java.io.tmpdir"), "scala_shell_tmp-" + abstractID) if (!tmpDir.exists) { tmpDir.mkdir } tmpDir } // scala_shell commands private val tmpDirShell: File = { new File(tmpDirBase, "scala_shell_commands") } // scala shell jar file name private val tmpJarShell: File = { new File(tmpDirBase, "scala_shell_commands.jar") } private val packageImports = Seq[String]( "org.apache.flink.core.fs._", "org.apache.flink.core.fs.local._", "org.apache.flink.api.common.io._", "org.apache.flink.api.common.aggregators._", "org.apache.flink.api.common.accumulators._", "org.apache.flink.api.common.distributions._", "org.apache.flink.api.common.operators._", "org.apache.flink.api.common.operators.base.JoinOperatorBase.JoinHint", "org.apache.flink.api.common.functions._", "org.apache.flink.api.java.io._", "org.apache.flink.api.java.aggregation._", "org.apache.flink.api.java.functions._", "org.apache.flink.api.java.operators._", "org.apache.flink.api.java.sampling._", "org.apache.flink.api.scala._", "org.apache.flink.api.scala.utils._", "org.apache.flink.streaming.api.scala._", "org.apache.flink.streaming.api.windowing.time._", "org.apache.flink.table.api._", 
"org.apache.flink.table.api.bridge.scala._", "org.apache.flink.table.connector.ChangelogMode", "org.apache.flink.table.functions._", "org.apache.flink.types.Row", "com.zto.fire._", "com.zto.fire.shell.flink" ) override def createInterpreter(): Unit = { super.createInterpreter() intp.beQuietDuring { // import dependencies intp.interpret("import " + packageImports.mkString(", ")) // set execution environments intp.bind("senv", this.env) intp.bind("fire", this.env) intp.bind("env", this.env) intp.bind("table", this.table) } } /** * Packages the compiled classes of the current shell session into a Jar file for execution * on a Flink cluster. * * @return The path of the created Jar file */ def writeFilesToDisk(): File = { val vd = intp.virtualDirectory val vdIt = vd.iterator for (fi <- vdIt) { if (fi.isDirectory) { val fiIt = fi.iterator for (f <- fiIt) { // directory for compiled line val lineDir = new File(tmpDirShell.getAbsolutePath, fi.name) lineDir.mkdirs() // compiled classes for commands from shell val writeFile = new File(lineDir.getAbsolutePath, f.name) val outputStream = new FileOutputStream(writeFile) val inputStream = f.input // copy file contents org.apache.commons.io.IOUtils.copy(inputStream, outputStream) inputStream.close() outputStream.close() } } } val compiledClasses = new File(tmpDirShell.getAbsolutePath) val jarFilePath = new File(tmpJarShell.getAbsolutePath) val jh: JarHelper = new JarHelper jh.jarDir(compiledClasses, jarFilePath) jarFilePath } /** * custom welcome message */ override def printWelcome() { FireUtils.isSplash = false FireUtils.splash } def getExternalJars(): Array[String] = externalJars.getOrElse(Array.empty[String]) } ================================================ FILE: fire-shell/flink-shell/src/main/scala/com/zto/fire/shell/flink/Test.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package com.zto.fire.shell.flink import com.zto.fire.common.anno.Config import com.zto.fire.flink.FlinkStreaming import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment /** * Flink Streaming development based on Fire */ @Config( """ |# These entries can be copied over directly from the configuration file | # comment line |kafka.brokers.name = bigdata_test |kafka.topics = fire |kafka.group.id=fire |fire.acc.timer.max.size=30 |fire.acc.log.max.size=20 |flink.stream.checkpoint.interval=60000 |flink.state.choose.disk.policy=round_robin |fire.analysis.arthas.enable=false |fire.log.level.conf.org.apache.flink=warn |fire.analysis.arthas.container.enable=false |fire.rest.filter.enable=true |""") object Test extends FlinkStreaming { def getFire: StreamExecutionEnvironment = this.fire def getStreamTableEnv: StreamTableEnvironment = this.steamTableEnv } ================================================ FILE: fire-shell/flink-shell/src/main/scala/org/apache/flink/api/scala/FlinkShell.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/ package org.apache.flink.api.scala import com.zto.fire.shell.flink.FireILoop import org.apache.flink.annotation.Internal import org.apache.flink.client.cli.{CliFrontend, CliFrontendParser} import org.apache.flink.client.deployment.DefaultClusterClientServiceLoader import org.apache.flink.client.deployment.executors.RemoteExecutor import org.apache.flink.client.program.{ClusterClient, MiniClusterClient} import org.apache.flink.configuration._ import org.apache.flink.runtime.minicluster.{MiniCluster, MiniClusterConfiguration} import java.io._ import scala.collection.mutable.ArrayBuffer import scala.tools.nsc.Settings import scala.tools.nsc.interpreter._ object FlinkShell { object ExecutionMode extends Enumeration { val UNDEFINED, LOCAL, REMOTE, YARN = Value } /** Configuration object */ case class Config( host: Option[String] = None, port: Option[Int] = None, externalJars: Option[Array[String]] = None, executionMode: ExecutionMode.Value = ExecutionMode.UNDEFINED, yarnConfig: Option[YarnConfig] = None, configDir: Option[String] = None ) /** YARN configuration object */ case class YarnConfig( jobManagerMemory: Option[String] = None, name: Option[String] = None, queue: Option[String] = None, slots: Option[Int] = None, taskManagerMemory: Option[String] = None ) /** Buffered reader to substitute input in test */ var bufferedReader: Option[BufferedReader] = None def main(args: Array[String]) { val parser = new scopt.OptionParser[Config]("start-scala-shell.sh") { head("Flink Scala Shell") cmd("local") action { (_, c) => c.copy(executionMode = ExecutionMode.LOCAL) } text "Starts Flink scala shell with a local Flink cluster" children( opt[(String)] ("addclasspath") abbr("a") valueName("") action { case (x, c) => val xArray = x.split(":") c.copy(externalJars = Option(xArray)) } text "Specifies additional jars to be used in Flink" ) cmd("remote") action { (_, c) => c.copy(executionMode = ExecutionMode.REMOTE) } text "Starts Flink scala shell connecting to a remote cluster" children( arg[String]("") action { (h, c) => c.copy(host = Some(h)) } text "Remote host name as string", arg[Int]("") action { (p, c) => c.copy(port = Some(p)) } text "Remote port as integer\n", opt[String]("addclasspath") abbr("a") valueName("") action { case (x, c) => val xArray = x.split(":") c.copy(externalJars = Option(xArray)) } text "Specifies additional jars to be used in Flink" ) cmd("yarn") action { (_, c) => c.copy(executionMode = ExecutionMode.YARN, yarnConfig = None) } text "Starts Flink scala shell connecting to a yarn cluster" children( opt[String]("jobManagerMemory") abbr ("jm") valueName ("arg") action { (x, c) => c.copy(yarnConfig = Some(ensureYarnConfig(c).copy(jobManagerMemory = Some(x)))) } text "Memory for JobManager container", opt[String]("name") abbr ("nm") action { (x, c) => c.copy(yarnConfig = Some(ensureYarnConfig(c).copy(name = Some(x)))) } text "Set a custom name for the application on YARN", opt[String]("queue") abbr ("qu") valueName ("") action { (x, c) => c.copy(yarnConfig = Some(ensureYarnConfig(c).copy(queue = Some(x)))) } text "Specifies YARN queue", opt[Int]("slots") abbr ("s") valueName ("") action { (x, c) => c.copy(yarnConfig = Some(ensureYarnConfig(c).copy(slots = Some(x)))) } text "Number of slots per TaskManager", opt[String]("taskManagerMemory") abbr ("tm") valueName ("") action { (x, c) => c.copy(yarnConfig = Some(ensureYarnConfig(c).copy(taskManagerMemory = Some(x)))) } text "Memory per TaskManager container", opt[(String)] ("addclasspath") abbr("a") valueName("") action { case (x, c) 
=> val xArray = x.split(":") c.copy(externalJars = Option(xArray)) } text "Specifies additional jars to be used in Flink" ) opt[String]("configDir").optional().action { (arg, conf) => conf.copy(configDir = Option(arg)) } text { "The configuration directory." } help("help") abbr ("h") text "Prints this usage text" } // parse arguments parser.parse(args, Config()) match { case Some(config) => startShell(config) case _ => println("Could not parse program arguments") } } @Internal def ensureYarnConfig(config: Config) = config.yarnConfig match { case Some(yarnConfig) => yarnConfig case None => YarnConfig() } private def getConfigDir(config: Config) = { config.configDir.getOrElse(CliFrontend.getConfigurationDirectoryFromEnv) } private def getGlobalConfig(config: Config) = { val confDirPath = getConfigDir(config) val configDirectory = new File(confDirPath) GlobalConfiguration.loadConfiguration(configDirectory.getAbsolutePath) } def startShell(config: Config): Unit = { println("Starting Flink Shell:") val flinkConfig = getGlobalConfig(config) val (repl, clusterClient) = try { val (effectiveConfig, clusterClient) = fetchConnectionInfo(config, flinkConfig) val host = effectiveConfig.getString(JobManagerOptions.ADDRESS) val port = effectiveConfig.getInteger(JobManagerOptions.PORT) println(s"\nConnecting to Flink cluster (host: $host, port: $port).\n") val repl = bufferedReader match { case Some(reader) => val out = new StringWriter() new FireILoop(effectiveConfig, config.externalJars, reader, new JPrintWriter(out)) case None => new FireILoop(effectiveConfig, config.externalJars) } (repl, clusterClient) } catch { case e: IllegalArgumentException => println(s"Error: ${e.getMessage}") sys.exit() } val settings = new Settings() settings.usejavacp.value = true settings.Yreplsync.value = true try { repl.process(settings) } finally { repl.closeInterpreter() clusterClient match { case Some(clusterClient) => clusterClient.shutDownCluster() clusterClient.close() case _ => } } println(" good bye ..") } @Internal def fetchConnectionInfo( config: Config, flinkConfig: Configuration): (Configuration, Option[ClusterClient[_]]) = { config.executionMode match { case ExecutionMode.LOCAL => createLocalClusterAndConfig(flinkConfig) case ExecutionMode.REMOTE => createRemoteConfig(config, flinkConfig) case ExecutionMode.YARN => createYarnClusterIfNeededAndGetConfig(config, flinkConfig) case ExecutionMode.UNDEFINED => // Wrong input throw new IllegalArgumentException("please specify execution mode:\n" + "[local | remote | yarn]") } } private def createYarnClusterIfNeededAndGetConfig(config: Config, flinkConfig: Configuration) = { flinkConfig.setBoolean(DeploymentOptions.ATTACHED, true) val (clusterConfig, clusterClient) = config.yarnConfig match { case Some(_) => deployNewYarnCluster(config, flinkConfig) case None => (flinkConfig, None) } val (effectiveConfig, _) = clusterClient match { case Some(_) => fetchDeployedYarnClusterInfo(config, clusterConfig, "yarn-cluster") case None => fetchDeployedYarnClusterInfo(config, clusterConfig, "default") } println("Configuration: " + effectiveConfig) (effectiveConfig, clusterClient) } private def deployNewYarnCluster(config: Config, flinkConfig: Configuration) = { val effectiveConfig = new Configuration(flinkConfig) val args = parseArgList(config, "yarn-cluster") val configurationDirectory = getConfigDir(config) val frontend = new CliFrontend( effectiveConfig, CliFrontend.loadCustomCommandLines(effectiveConfig, configurationDirectory)) val commandOptions = 
CliFrontendParser.getRunCommandOptions val commandLineOptions = CliFrontendParser.mergeOptions(commandOptions, frontend.getCustomCommandLineOptions) val commandLine = CliFrontendParser.parse(commandLineOptions, args, true) val customCLI = frontend.validateAndGetActiveCommandLine(commandLine) effectiveConfig.addAll(customCLI.toConfiguration(commandLine)) val serviceLoader = new DefaultClusterClientServiceLoader val clientFactory = serviceLoader.getClusterClientFactory(effectiveConfig) val clusterDescriptor = clientFactory.createClusterDescriptor(effectiveConfig) val clusterSpecification = clientFactory.getClusterSpecification(effectiveConfig) val clusterClient = try { clusterDescriptor .deploySessionCluster(clusterSpecification) .getClusterClient } finally { effectiveConfig.set(DeploymentOptions.TARGET, "yarn-session") clusterDescriptor.close() } (effectiveConfig, Some(clusterClient)) } private def fetchDeployedYarnClusterInfo( config: Config, flinkConfig: Configuration, mode: String) = { val effectiveConfig = new Configuration(flinkConfig) val args = parseArgList(config, mode) val configurationDirectory = getConfigDir(config) val frontend = new CliFrontend( effectiveConfig, CliFrontend.loadCustomCommandLines(effectiveConfig, configurationDirectory)) val commandOptions = CliFrontendParser.getRunCommandOptions val commandLineOptions = CliFrontendParser.mergeOptions(commandOptions, frontend.getCustomCommandLineOptions) val commandLine = CliFrontendParser.parse(commandLineOptions, args, true) val customCLI = frontend.validateAndGetActiveCommandLine(commandLine) effectiveConfig.addAll(customCLI.toConfiguration(commandLine)) (effectiveConfig, None) } def parseArgList(config: Config, mode: String): Array[String] = { val args = if (mode == "default") { ArrayBuffer[String]() } else { ArrayBuffer[String]("-m", mode) } config.yarnConfig match { case Some(yarnConfig) => yarnConfig.jobManagerMemory.foreach((jmMem) => args ++= Seq("-yjm", jmMem.toString)) yarnConfig.taskManagerMemory.foreach((tmMem) => args ++= Seq("-ytm", tmMem.toString)) yarnConfig.name.foreach((name) => args ++= Seq("-ynm", name.toString)) yarnConfig.queue.foreach((queue) => args ++= Seq("-yqu", queue.toString)) yarnConfig.slots.foreach((slots) => args ++= Seq("-ys", slots.toString)) args.toArray case None => args.toArray } } private def createRemoteConfig( config: Config, flinkConfig: Configuration): (Configuration, None.type) = { if (config.host.isEmpty || config.port.isEmpty) { throw new IllegalArgumentException(" or is not specified!") } val effectiveConfig = new Configuration(flinkConfig) setJobManagerInfoToConfig(effectiveConfig, config.host.get, config.port.get) effectiveConfig.set(DeploymentOptions.TARGET, RemoteExecutor.NAME) effectiveConfig.setBoolean(DeploymentOptions.ATTACHED, true) (effectiveConfig, None) } private def createLocalClusterAndConfig(flinkConfig: Configuration) = { val config = new Configuration(flinkConfig) config.setInteger(JobManagerOptions.PORT, 0) val cluster = createLocalCluster(config) val port = cluster.getRestAddress.get.getPort setJobManagerInfoToConfig(config, "localhost", port) config.set(DeploymentOptions.TARGET, RemoteExecutor.NAME) config.setBoolean(DeploymentOptions.ATTACHED, true) println(s"\nStarting local Flink cluster (host: localhost, port: ${port}).\n") val clusterClient = new MiniClusterClient(config, cluster) (config, Some(clusterClient)) } private def createLocalCluster(flinkConfig: Configuration) = { val numTaskManagers = 
flinkConfig.getInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, ConfigConstants.DEFAULT_LOCAL_NUMBER_TASK_MANAGER) val numSlotsPerTaskManager = flinkConfig.getInteger(TaskManagerOptions.NUM_TASK_SLOTS) val miniClusterConfig = new MiniClusterConfiguration.Builder() .setConfiguration(flinkConfig) .setNumSlotsPerTaskManager(numSlotsPerTaskManager) .setNumTaskManagers(numTaskManagers) .build() val cluster = new MiniCluster(miniClusterConfig) cluster.start() cluster } private def setJobManagerInfoToConfig( config: Configuration, host: String, port: Integer): Unit = { config.setString(JobManagerOptions.ADDRESS, host) config.setInteger(JobManagerOptions.PORT, port) config.setString(RestOptions.ADDRESS, host) config.setInteger(RestOptions.PORT, port) } } ================================================ FILE: fire-shell/pom.xml ================================================ 4.0.0 fire-shell pom Fire : Shell : spark-shell flink-shell fire-parent com.zto.fire 2.3.2-SNAPSHOT com.zto.fire fire-common_${scala.binary.version} ${fire.version} ${maven.scope} com.zto.fire fire-core_${scala.binary.version} ${fire.version} ${maven.scope} com.zto.fire fire-connector-jdbc_${scala.binary.version} ${fire.version} ${maven.scope} com.zto.fire fire-connector-spark-hbase_${spark.reference} ${fire.version} org.apache.maven.plugins maven-compiler-plugin 8 8 src/main/resources true ================================================ FILE: fire-shell/spark-shell/pom.xml ================================================ 4.0.0 spark-shell_${spark.reference} Fire : Shell : Spark fire-shell com.zto.fire 2.3.2-SNAPSHOT jline jline 2.14.6 com.zto.fire fire-common_${scala.binary.version} ${fire.version} com.zto.fire fire-spark_${spark.reference} ${fire.version} com.fasterxml.jackson.core jackson-databind 2.10.0 com.fasterxml.jackson.core jackson-core 2.10.0 org.apache.spark spark-core_${scala.binary.version} com.esotericsoftware.kryo kryo ${spark.version} org.apache.spark spark-sql_${scala.binary.version} ${spark.version} org.apache.spark spark-streaming_${scala.binary.version} ${spark.version} org.apache.spark spark-sql-kafka-0-10_${scala.binary.version} ${spark.version} org.apache.spark spark-streaming_${scala.binary.version} ${spark.version} org.apache.spark spark-streaming-kafka-0-10_${scala.binary.version} ${spark.version} org.slf4j jul-to-slf4j 1.7.30 org.apache.spark spark-tags_${scala.binary.version} ${spark.version} org.apache.spark spark-tags_${scala.binary.version} test-jar test ${spark.version} org.apache.xbean xbean-asm7-shaded 4.16 org.eclipse.jetty jetty-server 9.4.34.v20201102 org.eclipse.jetty jetty-plus 9.4.34.v20201102 org.eclipse.jetty jetty-util 9.4.34.v20201102 org.eclipse.jetty jetty-http 9.4.34.v20201102 org.apache.hadoop hadoop-common ${hadoop.version} org.apache.hadoop hadoop-hdfs ${hadoop.version} org.apache.hadoop hadoop-client ${hadoop.version} org.apache.hbase hbase-common ${hbase.version} org.apache.hbase hbase-client org.apache.hbase hbase-server ${hbase.version} org.apache.hbase hbase-client org.apache.hbase hbase-client ${hbase.version} org.apache.commons commons-lang3 3.5 commons-io commons-io 2.4 log4j log4j 1.2.17 org.apache.rocketmq rocketmq-client ${rocketmq.version} org.apache.hudi hudi-spark-bundle_${scala.binary.version} 0.7.0 ${maven.scope} ru.yandex.clickhouse clickhouse-jdbc 0.2.4 ${maven.scope} com.google.guava guava ${guava.version} ================================================ FILE: fire-shell/spark-shell/src/main/scala-spark-3.0/com/zto/fire/shell/spark/FireILoop.scala 
================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.shell.spark import com.zto.fire.common.util.FireUtils import java.io.BufferedReader // scalastyle:off println import scala.Predef.{println => _, _} // scalastyle:on println import scala.concurrent.Future import scala.reflect.classTag import scala.reflect.io.File import scala.tools.nsc.{GenericRunnerSettings, Properties} import scala.tools.nsc.Settings import scala.tools.nsc.interpreter.{isReplDebug, isReplPower, replProps} import scala.tools.nsc.interpreter.{AbstractOrMissingHandler, ILoop, IMain, JPrintWriter} import scala.tools.nsc.interpreter.{NamedParam, SimpleReader, SplashLoop, SplashReader} import scala.tools.nsc.interpreter.StdReplTags.tagOfIMain import scala.tools.nsc.util.stringFromStream import scala.util.Properties.{javaVersion, javaVmName, versionString} /** * A Spark-specific interactive shell. * intp.interpret("println(\"hello world\")") */ class FireILoop(in0: Option[BufferedReader], out: JPrintWriter) extends ILoop(in0, out) { def this(in0: BufferedReader, out: JPrintWriter) = this(Some(in0), out) def this() = this(None, new JPrintWriter(Console.out, true)) val initializationCommands: Seq[String] = Seq( "import org.apache.spark.SparkContext._", "import org.apache.spark.sql.functions._", "import com.zto.fire.spark.SparkCore", "import com.zto.fire._", "import com.zto.fire.spark.util.SparkSingletonFactory", "import com.zto.fire.shell.spark.Test", """ Test.main(null) @transient val spark = Test.getSparkSession @transient val fire = Test.getFire @transient val sc = Test.getSc """ ) def initializeSpark(): Unit = { if (!intp.reporter.hasErrors) { // `savingReplayStack` removes the commands from session history. savingReplayStack { initializationCommands.foreach(intp quietRun _) } } else { throw new RuntimeException(s"Scala $versionString interpreter encountered " + "errors during initialization") } } /** Print a welcome message */ override def printWelcome(): Unit = { FireUtils.isSplash = false FireUtils.splash echo("Type in expressions to have them evaluated.") echo("Type :help for more information.") } /** Available commands */ override def commands: List[LoopCommand] = standardCommands override def resetCommand(line: String): Unit = { super.resetCommand(line) initializeSpark() echo("Note that after :reset, state of SparkSession and SparkContext is unchanged.") } override def replay(): Unit = { initializeSpark() super.replay() } /** * The following code is mostly a copy of `process` implementation in `ILoop.scala` in Scala * * In newer version of Scala, `printWelcome` is the first thing to be called. As a result, * SparkUI URL information would be always shown after the welcome message. 
* * However, this is inconsistent compared with the existing version of Spark which will always * show SparkUI URL first. * * The only way we can make it consistent will be duplicating the Scala code. * * We should remove this duplication once Scala provides a way to load our custom initialization * code, and also customize the ordering of printing welcome message. */ override def process(settings: Settings): Boolean = { def newReader = in0.fold(chooseReader(settings))(r => SimpleReader(r, out, interactive = true)) /** Reader to use before interpreter is online. */ def preLoop = { val sr = SplashReader(newReader) { r => in = r in.postInit() } in = sr SplashLoop(sr, prompt) } /* Actions to cram in parallel while collecting first user input at prompt. * Run with output muted both from ILoop and from the intp reporter. */ def loopPostInit(): Unit = mumly { // Bind intp somewhere out of the regular namespace where // we can get at it in generated code. intp.quietBind(NamedParam[IMain]("$intp", intp)(tagOfIMain, classTag[IMain])) // Auto-run code via some setting. ( replProps.replAutorunCode.option flatMap (f => File(f).safeSlurp()) foreach (intp quietRun _) ) // power mode setup if (isReplPower) enablePowerMode(true) initializeSpark() loadInitFiles() // SI-7418 Now, and only now, can we enable TAB completion. in.postInit() } def loadInitFiles(): Unit = settings match { case settings: GenericRunnerSettings => for (f <- settings.loadfiles.value) { loadCommand(f) addReplay(s":load $f") } for (f <- settings.pastefiles.value) { pasteCommand(f) addReplay(s":paste $f") } case _ => } // wait until after startup to enable noisy settings def withSuppressedSettings[A](body: => A): A = { val ss = this.settings import ss._ val noisy = List(Xprint, Ytyperdebug) val noisesome = noisy.exists(!_.isDefault) val current = (Xprint.value, Ytyperdebug.value) if (isReplDebug || !noisesome) body else { this.settings.Xprint.value = List.empty this.settings.Ytyperdebug.value = false try body finally { Xprint.value = current._1 Ytyperdebug.value = current._2 intp.global.printTypings = current._2 } } } def startup(): String = withSuppressedSettings { // let them start typing val splash = preLoop // while we go fire up the REPL try { // don't allow ancient sbt to hijack the reader savingReader { createInterpreter() } intp.initializeSynchronous() val field = classOf[scala.tools.nsc.interpreter.ILoop].getDeclaredFields.filter(_.getName.contains("globalFuture")).head field.setAccessible(true) field.set(this, Future successful true) if (intp.reporter.hasErrors) { echo("Interpreter encountered errors during initialization!") null } else { loopPostInit() printWelcome() splash.start() val line = splash.line // what they typed in while they were waiting if (line == null) { // they ^D try out print Properties.shellInterruptedString finally closeInterpreter() } line } } finally splash.stop() } this.settings = settings startup() match { case null => false case line => try loop(line) match { case LineResults.EOF => out print Properties.shellInterruptedString case _ => } catch AbstractOrMissingHandler() finally closeInterpreter() true } } } ================================================ FILE: fire-shell/spark-shell/src/main/scala-spark-3.0/com/zto/fire/shell/spark/Main.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.zto.fire.shell.spark import java.io.File import java.net.URI import scala.tools.nsc.GenericRunnerSettings import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.repl.Signaling import org.apache.spark.sql.SparkSession /** * java -Dscala.usejavacp=true -cp /Users/insight/project/workspace/fire/fire-repl/spark-repl/target/zto-spark-repl_2.12-2.2.0-SNAPSHOT.jar com.zto.fire.repl.spark.Main */ object Main extends Logging { System.setProperty("scala.usejavacp", "true") initializeLogIfNecessary(true) Signaling.cancelOnInterrupt() val utils = Class.forName("org.apache.spark.util.Utils") val conf = new SparkConf() val rootDir = conf.getOption("spark.repl.classdir").getOrElse(utils.getMethod("getLocalDir", classOf[SparkConf]).invoke(null, conf).asInstanceOf[String]) val outputDir = utils.getMethod("createTempDir", classOf[String], classOf[String]).invoke(null, rootDir, "repl").asInstanceOf[File] var sparkContext: SparkContext = _ var sparkSession: SparkSession = _ // this is a public var because tests reset it. var interp: FireILoop = _ private var hasErrors = false private var isShellSession = false private def scalaOptionError(msg: String): Unit = { hasErrors = true // scalastyle:off println Console.err.println(msg) // scalastyle:on println } def main(args: Array[String]): Unit = { isShellSession = true doMain(args, new FireILoop) } // Visible for testing private[shell] def doMain(args: Array[String], _interp: FireILoop): Unit = { interp = _interp val jars = utils.getMethod("getLocalUserJarsForShell", classOf[SparkConf]).invoke(null, conf).asInstanceOf[Seq[String]] // Remove file:///, file:// or file:/ scheme if exists for each jar .map { x => if (x.startsWith("file:")) new File(new URI(x)).getPath else x } .mkString(File.pathSeparator) val interpArguments = List( "-Yrepl-class-based", "-Yrepl-outdir", s"${outputDir.getAbsolutePath}", "-classpath", jars ) ++ args.toList val settings = new GenericRunnerSettings(scalaOptionError) settings.processArguments(interpArguments, true) if (!hasErrors) { interp.process(settings) // Repl starts and goes in loop of R.E.P.L Option(sparkContext).foreach(_.stop) } } } ================================================ FILE: fire-shell/spark-shell/src/main/scala-spark-3.0/com/zto/fire/shell/spark/Test.scala ================================================ package com.zto.fire.shell.spark import com.zto.fire.common.anno.Config import com.zto.fire.spark.SparkStreaming import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession @Config( """ |hive.cluster=test |kafka.brokers.name = bigdata_test |kafka.topics = fire |kafka.group.id=fire |spark.streaming.stopGracefullyOnShutdown=false |""") object Test extends SparkStreaming { def getFire: SparkSession = this.fire def getSparkSession: SparkSession = this.fire def getSc: SparkContext = this.sc } ================================================ FILE: 
fire-shell/spark-shell/src/main/scala-spark-3.0/org/apache/spark/repl/ExecutorClassLoader.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.repl import java.io.{ByteArrayOutputStream, FileNotFoundException, FilterInputStream, InputStream} import java.net.{URI, URL, URLEncoder} import java.nio.channels.Channels import java.nio.charset.StandardCharsets.UTF_8 import scala.util.control.NonFatal import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.xbean.asm7._ import org.apache.xbean.asm7.Opcodes._ import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.util.ParentClassLoader /** * A ClassLoader that reads classes from a Hadoop FileSystem or Spark RPC endpoint, used to load * classes defined by the interpreter when the REPL is used. Allows the user to specify if user * class path should be first. * This class loader delegates getting/finding resources to parent loader, which makes sense because * the REPL never produce resources dynamically. One exception is when getting a Class file as * resource stream, in which case we will try to fetch the Class file in the same way as loading * the class, so that dynamically generated Classes from the REPL can be picked up. * * Note: [[ClassLoader]] will preferentially load class from parent. Only when parent is null or * the load failed, that it will call the overridden `findClass` function. To avoid the potential * issue caused by loading class using inappropriate class loader, we should set the parent of * ClassLoader to null, so that we can fully control which class loader is used. For detailed * discussion, see SPARK-18646. 
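* Illustrative note on the lookup order implemented by findClass below: when userClassPathFirst is true, the loader first tries findClassLocally(name), which fetches the ".class" bytes either over Spark RPC (classUri scheme "spark") or from the Hadoop FileSystem referenced by classUri, and only delegates to the parent loader if that lookup fails; when userClassPathFirst is false the parent loader is consulted first and findClassLocally serves as the fallback.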
*/ class ExecutorClassLoader( conf: SparkConf, env: SparkEnv, classUri: String, parent: ClassLoader, userClassPathFirst: Boolean) extends ClassLoader(null) with Logging { val uri = new URI(classUri) val directory = uri.getPath val parentLoader = new ParentClassLoader(parent) // Allows HTTP connect and read timeouts to be controlled for testing / debugging purposes private[repl] var httpUrlConnectionTimeoutMillis: Int = -1 private val fetchFn: (String) => InputStream = uri.getScheme() match { case "spark" => getClassFileInputStreamFromSparkRPC case _ => val fileSystem = FileSystem.get(uri, SparkHadoopUtil.get.newConfiguration(conf)) getClassFileInputStreamFromFileSystem(fileSystem) } override def getResource(name: String): URL = { parentLoader.getResource(name) } override def getResources(name: String): java.util.Enumeration[URL] = { parentLoader.getResources(name) } override def getResourceAsStream(name: String): InputStream = { if (userClassPathFirst) { val res = getClassResourceAsStreamLocally(name) if (res != null) res else parentLoader.getResourceAsStream(name) } else { val res = parentLoader.getResourceAsStream(name) if (res != null) res else getClassResourceAsStreamLocally(name) } } private def getClassResourceAsStreamLocally(name: String): InputStream = { // Class files can be dynamically generated from the REPL. Allow this class loader to // load such files for purposes other than loading the class. try { if (name.endsWith(".class")) fetchFn(name) else null } catch { // The helper functions referenced by fetchFn throw CNFE to indicate failure to fetch // the class. It matches what IOException was supposed to be used for, and // ClassLoader.getResourceAsStream() catches IOException and returns null in that case. // So we follow that model and handle CNFE here. case _: ClassNotFoundException => null } } override def findClass(name: String): Class[_] = { if (userClassPathFirst) { findClassLocally(name).getOrElse(parentLoader.loadClass(name)) } else { try { parentLoader.loadClass(name) } catch { case e: ClassNotFoundException => val classOption = try { findClassLocally(name) } catch { case e: RemoteClassLoaderError => throw e case NonFatal(e) => // Wrap the error to include the class name // scalastyle:off throwerror throw new RemoteClassLoaderError(name, e) // scalastyle:on throwerror } classOption match { case None => throw new ClassNotFoundException(name, e) case Some(a) => a } } } } // See org.apache.spark.network.server.TransportRequestHandler.processStreamRequest. private val STREAM_NOT_FOUND_REGEX = s"Stream '.*' was not found.".r.pattern private def getClassFileInputStreamFromSparkRPC(path: String): InputStream = { val channel = env.rpcEnv.openChannel(s"$classUri/${urlEncode(path)}") new FilterInputStream(Channels.newInputStream(channel)) { override def read(): Int = toClassNotFound(super.read()) override def read(b: Array[Byte], offset: Int, len: Int) = toClassNotFound(super.read(b, offset, len)) private def toClassNotFound(fn: => Int): Int = { try { fn } catch { case e: RuntimeException if e.getMessage != null && STREAM_NOT_FOUND_REGEX.matcher(e.getMessage).matches() => // Convert a stream not found error to ClassNotFoundException. // Driver sends this explicit acknowledgment to tell us that the class was missing. 
throw new ClassNotFoundException(path, e) case NonFatal(e) => // scalastyle:off throwerror throw new RemoteClassLoaderError(path, e) // scalastyle:on throwerror } } } } private def getClassFileInputStreamFromFileSystem(fileSystem: FileSystem)( pathInDirectory: String): InputStream = { val path = new Path(directory, pathInDirectory) try { fileSystem.open(path) } catch { case _: FileNotFoundException => throw new ClassNotFoundException(s"Class file not found at path $path") } } def findClassLocally(name: String): Option[Class[_]] = { val pathInDirectory = name.replace('.', '/') + ".class" var inputStream: InputStream = null try { inputStream = fetchFn(pathInDirectory) val bytes = readAndTransformClass(name, inputStream) Some(defineClass(name, bytes, 0, bytes.length)) } catch { case e: ClassNotFoundException => // We did not find the class logDebug(s"Did not load class $name from REPL class server at $uri", e) None case e: Exception => // Something bad happened while checking if the class exists logError(s"Failed to check existence of class $name on REPL class server at $uri", e) if (userClassPathFirst) { // Allow to try to load from "parentLoader" None } else { throw e } } finally { if (inputStream != null) { try { inputStream.close() } catch { case e: Exception => logError("Exception while closing inputStream", e) } } } } def readAndTransformClass(name: String, in: InputStream): Array[Byte] = { if (name.startsWith("line") && name.endsWith("$iw$")) { // Class seems to be an interpreter "wrapper" object storing a val or var. // Replace its constructor with a dummy one that does not run the // initialization code placed there by the REPL. The val or var will // be initialized later through reflection when it is used in a task. val cr = new ClassReader(in) val cw = new ClassWriter( ClassWriter.COMPUTE_FRAMES + ClassWriter.COMPUTE_MAXS) val cleaner = new ConstructorCleaner(name, cw) cr.accept(cleaner, 0) return cw.toByteArray } else { // Pass the class through unmodified val bos = new ByteArrayOutputStream val bytes = new Array[Byte](4096) var done = false while (!done) { val num = in.read(bytes) if (num >= 0) { bos.write(bytes, 0, num) } else { done = true } } return bos.toByteArray } } /** * URL-encode a string, preserving only slashes */ def urlEncode(str: String): String = { str.split('/').map(part => URLEncoder.encode(part, UTF_8.name())).mkString("/") } } class ConstructorCleaner(className: String, cv: ClassVisitor) extends ClassVisitor(ASM7, cv) { override def visitMethod(access: Int, name: String, desc: String, sig: String, exceptions: Array[String]): MethodVisitor = { val mv = cv.visitMethod(access, name, desc, sig, exceptions) if (name == "<init>" && (access & ACC_STATIC) == 0) { // This is the constructor, time to clean it; just output some new // instructions to mv that create the object and set the static MODULE$ // field in the class to point to it, but do nothing otherwise. mv.visitCode() mv.visitVarInsn(ALOAD, 0) // load this mv.visitMethodInsn(INVOKESPECIAL, "java/lang/Object", "<init>", "()V", false) mv.visitVarInsn(ALOAD, 0) // load this // val classType = className.replace('.', '/') // mv.visitFieldInsn(PUTSTATIC, classType, "MODULE$", "L" + classType + ";") mv.visitInsn(RETURN) mv.visitMaxs(-1, -1) // stack size and local vars will be auto-computed mv.visitEnd() return null } else { return mv } } } /** * An error when we cannot load a class due to exceptions.
We don't know if this class exists, so * throw a special one that's neither [[LinkageError]] nor [[ClassNotFoundException]] to make JVM * retry to load this class later. */ private[repl] class RemoteClassLoaderError(className: String, cause: Throwable) extends Error(className, cause) ================================================ FILE: fire-shell/spark-shell/src/main/scala-spark-3.0/org/apache/spark/repl/Signaling.scala ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.spark.repl import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.util.SignalUtils object Signaling extends Logging { /** * Register a SIGINT handler, that terminates all active spark jobs or terminates * when no jobs are currently running. * This makes it possible to interrupt a running shell job by pressing Ctrl+C. */ def cancelOnInterrupt(): Unit = SignalUtils.register("INT") { SparkContext.getActive.map { ctx => if (!ctx.statusTracker.getActiveJobIds().isEmpty) { logWarning("Cancelling all active jobs, this can take a while. 
" + "Press Ctrl+C again to exit now.") ctx.cancelAllJobs() true } else { false } }.getOrElse(false) } } ================================================ FILE: pom.xml ================================================ 4.0.0 com.zto.fire fire-parent pom 2.3.2-SNAPSHOT Fire : ${project.version} 0.9.0 2.5 0.3.1 provided 0.11.0.2 2.8.0 2.6.0 1.1.0 1.2.1 1.1.0 org.apache.hive 1.2.0 2.5.30 4.13.2 4.8.0 5.1.49 15.0 2.6.0 3.5.4 1.26.0 4.1.17.Final UTF-8 ${scala.binary.version}.${scala.minor.version} ${spark.version}_${scala.binary.version} ${flink.version}_${scala.binary.version} fire-common fire-core fire-metrics fire-examples fire-connectors fire-engines fire-enhance fire-shell fire-platform scala-2.11 2.11 8 scala-2.12 true 2.12 13 spark-2.3.2 2.3.2 2.3 2.6.7 spark-2.4.8 2.4.8 2.4 2.6.7 spark-3.0.2 true 3.0.2 3.0 2.10.5 org.apache.spark spark-avro_${scala.binary.version} ${spark.version} ${maven.scope} spark-3.1.3 3.1.3 3.1 2.10.5 org.apache.spark spark-avro_${scala.binary.version} ${spark.version} ${maven.scope} spark-3.2.1 3.2.1 3.2 2.10.5 org.apache.spark spark-avro_${scala.binary.version} ${spark.version} ${maven.scope} spark-3.3.0 3.3.0 3.3 2.10.5 org.apache.spark spark-avro_${scala.binary.version} ${spark.version} ${maven.scope} hadoop-2.7 true org.spark-project.hive 1.2.1.spark2 org.apache.spark spark-hive_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.hive hive-common org.apache.hive hive-exec org.apache.hive hive-metastore org.apache.hive hive-serde org.apache.hive hive-shims org.apache.spark spark-hive-thriftserver_${scala.binary.version} ${spark.version} ${maven.scope} org.apache.hive hive-cli org.apache.hive hive-jdbc org.apache.hive hive-beeline ${hive.group} hive-cli ${hive.version} ${maven.scope} ${hive.group} hive-jdbc ${hive.version} ${maven.scope} ${hive.group} hive-beeline ${hive.version} ${maven.scope} ${hive.group} hive-common ${hive.version} ${maven.scope} ${hive.group} hive-metastore ${hive.version} ${maven.scope} ${hive.group} hive-exec ${hive.version} ${maven.scope} org.apache.commons commons-lang3 org.apache.spark spark-core_2.10 hadoop-3.2 3.2.0 org.apache.spark spark-hive_${scala.binary.version} ${spark.version} ${maven.scope} flink-1.12.2 1.12.2 1.12 org.apache.flink flink-table-planner-blink_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-queryable-state-runtime_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-runtime_${scala.binary.version} ${flink.version} ${maven.scope} flink-1.13.0 1.13.0 1.13 org.apache.flink flink-table-planner-blink_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-queryable-state-runtime_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-runtime_${scala.binary.version} ${flink.version} ${maven.scope} flink-1.14.3 true 1.14.3 1.14 org.apache.flink flink-table-planner_${scala.binary.version} ${flink.version} ${maven.scope} org.apache.flink flink-queryable-state-runtime ${flink.version} ${maven.scope} org.scala-lang scala-library ${scala.version} org.scala-lang scala-compiler ${scala.version} org.scala-lang scala-reflect ${scala.version} com.google.guava guava ${guava.version} ${maven.scope} junit junit ${junit.version} test com.fasterxml.jackson.core jackson-databind ${jackson.version} ${maven.scope} commons-io commons-io 2.4 ${maven.scope} org.apache.commons commons-lang3 3.5 ${maven.scope} log4j log4j 1.2.17 ${maven.scope} com.esotericsoftware kryo 4.0.0 ${maven.scope} com.sparkjava spark-core 
${sparkjava.version} org.quartz-scheduler quartz 2.3.1 com.github.oshi oshi-core 3.12.2 ${maven.scope} cloudera https://repository.cloudera.com/artifactory/cloudera-repos true true aliyun https://maven.aliyun.com/repository/central true true central https://mirrors.huaweicloud.com/repository/maven/ true true cloudera https://repository.cloudera.com/artifactory/cloudera-repos true true aliyun https://maven.aliyun.com/repository/central true true central https://mirrors.huaweicloud.com/repository/maven/ true true true org.apache.maven.plugins maven-compiler-plugin 1.8 1.8 org.scala-tools maven-scala-plugin 2.15.2 scala-compile-first process-resources compile scala-test-compile process-test-resources testCompile org.codehaus.mojo build-helper-maven-plugin add-source generate-sources add-source src/main/java src/main/scala src/main/java-spark-${spark.major.version} src/main/scala-spark-${spark.major.version} src/main/java-flink-${flink.major.version} src/main/scala-flink-${flink.major.version} add-test-source generate-test-sources add-test-source src/test/java src/test/scala src/test/java-spark-${spark.major.version} src/test/scala-spark-${spark.major.version} src/test/java-flink-${flink.major.version} src/test/scala-flink-${flink.major.version} org.apache.maven.plugins maven-eclipse-plugin 2.10 true true org.scala-ide.sdt.core.scalanature org.eclipse.jdt.core.javanature org.scala-ide.sdt.core.scalabuilder org.scala-ide.sdt.launching.SCALA_CONTAINER org.eclipse.jdt.launching.JRE_CONTAINER org.scala-lang:scala-library org.scala-lang:scala-compiler **/*.scala **/*.java org.apache.maven.plugins maven-surefire-plugin 2.19.1 **/*.java **/*.scala org.apache.maven.plugins maven-shade-plugin 2.4.2 package shade *:* META-INF/*.SF META-INF/*.DSA META-INF/*.RSA zto-${project.artifactId}-${project.version}